diff --git a/modules/dnn/CMakeLists.txt b/modules/dnn/CMakeLists.txt
index 415ced70b47..8a38a824699 100644
--- a/modules/dnn/CMakeLists.txt
+++ b/modules/dnn/CMakeLists.txt
@@ -9,7 +9,7 @@ endif()
 
 set(the_description "Deep neural network module. It allows to load models from different frameworks and to make forward pass")
 
-ocv_add_module(dnn opencv_core opencv_imgproc WRAP python matlab)
+ocv_add_module(dnn opencv_core opencv_imgproc)
 ocv_warnings_disable(CMAKE_CXX_FLAGS -Wno-shadow -Wno-parentheses -Wmaybe-uninitialized -Wsign-promo
                      -Wmissing-declarations -Wmissing-prototypes
 )
diff --git a/modules/dnn/include/opencv2/dnn/all_layers.hpp b/modules/dnn/include/opencv2/dnn/all_layers.hpp
index 42bd2811f32..70d8687fe31 100644
--- a/modules/dnn/include/opencv2/dnn/all_layers.hpp
+++ b/modules/dnn/include/opencv2/dnn/all_layers.hpp
@@ -72,12 +72,18 @@ namespace dnn
      - Dropout (since it does nothing on forward pass -))
     */
+    class CV_EXPORTS BlankLayer : public Layer
+    {
+    public:
+        static Ptr<BlankLayer> create(const LayerParams &params);
+    };
+
     //! LSTM recurrent layer
-    class CV_EXPORTS_W LSTMLayer : public Layer
+    class CV_EXPORTS LSTMLayer : public Layer
     {
     public:
         /** Creates instance of LSTM layer */
-        static CV_WRAP Ptr<LSTMLayer> create();
+        static Ptr<LSTMLayer> create(const LayerParams& params);
 
         /** Set trained weights for LSTM layer.
         LSTM behavior on each step is defined by current input, previous output, previous cell state and learned weights.
 
@@ -109,27 +115,27 @@ namespace dnn
         @param Wx is matrix defining how current input is transformed to internal gates (i.e. according to the above-mentioned notation it is @f$ W_x @f$)
         @param b  is bias vector (i.e. according to the above-mentioned notation it is @f$ b @f$)
         */
-        CV_WRAP virtual void setWeights(const Blob &Wh, const Blob &Wx, const Blob &b) = 0;
+        virtual void setWeights(const Mat &Wh, const Mat &Wx, const Mat &b) = 0;
 
         /** @brief Specifies shape of output blob which will be [[`T`], `N`] + @p outTailShape.
          * @details If this parameter is empty or unset then @p outTailShape = [`Wh`.size(0)] will be used,
          * where `Wh` is parameter from setWeights().
          */
-        CV_WRAP virtual void setOutShape(const BlobShape &outTailShape = BlobShape::empty()) = 0;
+        virtual void setOutShape(const std::vector<int> &outTailShape = std::vector<int>()) = 0;
 
         /** @brief Set @f$ h_{t-1} @f$ value that will be used in next forward() calls.
          * @details By default @f$ h_{t-1} @f$ is initialized with zeros and updated after each forward() call.
          */
-        CV_WRAP virtual void setH(const Blob &H) = 0;
+        virtual void setH(const Mat &H) = 0;
         /** @brief Returns current @f$ h_{t-1} @f$ value (deep copy). */
-        CV_WRAP virtual Blob getH() const = 0;
+        virtual Mat getH() const = 0;
 
         /** @brief Set @f$ c_{t-1} @f$ value that will be used in next forward() calls.
          * @details By default @f$ c_{t-1} @f$ is initialized with zeros and updated after each forward() call.
         */
-        CV_WRAP virtual void setC(const Blob &C) = 0;
+        virtual void setC(const Mat &C) = 0;
         /** @brief Returns current @f$ c_{t-1} @f$ value (deep copy). */
-        CV_WRAP virtual Blob getC() const = 0;
+        virtual Mat getC() const = 0;
 
         /** @brief Specifies whether the first dimension of the input blob should be interpreted as the timestamp dimension or as the sample dimension.
          *
@@ -139,14 +145,14 @@ namespace dnn
          * If the flag is set to false then the shape of the input blob will be interpreted as [`N`, `[data dims]`].
          * In this case each forward() call will make one iteration and produce one timestamp with shape [`N`, `[out dims]`].
         */
-        CV_WRAP virtual void setUseTimstampsDim(bool use = true) = 0;
+        virtual void setUseTimstampsDim(bool use = true) = 0;
 
         /** @brief If this flag is set to true then layer will produce @f$ c_t @f$ as second output.
         * @details Shape of the second output is the same as first output.
         */
-        CV_WRAP virtual void setProduceCellOutput(bool produce = false) = 0;
+        virtual void setProduceCellOutput(bool produce = false) = 0;
 
-        /** In common case it use single input with @f$x_t@f$ values to compute output(s) @f$h_t@f$ (and @f$c_t@f$).
+        /* In the common case it uses a single input with @f$x_t@f$ values to compute the output(s) @f$h_t@f$ (and @f$c_t@f$).
         * @param input should contain packed values @f$x_t@f$
        * @param output contains computed outputs: @f$h_t@f$ (and @f$c_t@f$ if setProduceCellOutput() flag was set to true).
        *
@@ -156,19 +162,17 @@ namespace dnn
        * If setUseTimstampsDim() is set to false then @p input[0] should contain a single timestamp, and its shape should have the form [`N`, `[data dims]`] with at least one dimension.
        * (i.e. @f$ x_{t}^{stream} @f$ is stored inside @p input[0][stream, ...]).
        */
-        void forward(std::vector<Blob*> &input, std::vector<Blob> &output);
 
        int inputNameToIndex(String inputName);
-        int outputNameToIndex(String outputName);
    };
 
     //! Classical recurrent layer
-    class CV_EXPORTS_W RNNLayer : public Layer
+    class CV_EXPORTS RNNLayer : public Layer
     {
     public:
         /** Creates instance of RNNLayer */
-        static CV_WRAP Ptr<RNNLayer> create();
+        static Ptr<RNNLayer> create(const LayerParams& params);
 
         /** Setups learned weights.
 
@@ -184,12 +188,12 @@ namespace dnn
        @param Who is @f$ W_{xo} @f$ matrix
        @param bo  is @f$ b_{o} @f$ vector
        */
-        CV_WRAP virtual void setWeights(const Blob &Wxh, const Blob &bh, const Blob &Whh, const Blob &Who, const Blob &bo) = 0;
+        virtual void setWeights(const Mat &Wxh, const Mat &bh, const Mat &Whh, const Mat &Who, const Mat &bo) = 0;
 
         /** @brief If this flag is set to true then layer will produce @f$ h_t @f$ as second output.
         * @details Shape of the second output is the same as first output.
         */
-        CV_WRAP virtual void setProduceHiddenOutput(bool produce = false) = 0;
+        virtual void setProduceHiddenOutput(bool produce = false) = 0;
 
         /** Accepts two inputs @f$x_t@f$ and @f$h_{t-1}@f$ and computes two outputs @f$o_t@f$ and @f$h_t@f$.
 
@@ -200,57 +204,49 @@ namespace dnn
 
        @p output[0] will have shape [`T`, `N`, @f$N_o@f$], where @f$N_o@f$ is number of rows in @f$ W_{xo} @f$ matrix.
 
-        If setProduceHiddenOutput() is set to true then @p output[1] will contain a Blob with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
+        If setProduceHiddenOutput() is set to true then @p output[1] will contain a Mat with shape [`T`, `N`, @f$N_h@f$], where @f$N_h@f$ is number of rows in @f$ W_{hh} @f$ matrix.
        */
-        void forward(std::vector<Blob*> &input, std::vector<Blob> &output);
    };
 
-    class CV_EXPORTS_W BaseConvolutionLayer : public Layer
+    class CV_EXPORTS BaseConvolutionLayer : public Layer
     {
     public:
-
-        CV_PROP_RW Size kernel, stride, pad, dilation, adjustPad;
-        CV_PROP_RW String padMode;
+        Size kernel, stride, pad, dilation, adjustPad;
+        String padMode;
     };
 
-    class CV_EXPORTS_W ConvolutionLayer : public BaseConvolutionLayer
+    class CV_EXPORTS ConvolutionLayer : public BaseConvolutionLayer
     {
     public:
-
-        static CV_WRAP Ptr<BaseConvolutionLayer> create(Size kernel = Size(3, 3), Size stride = Size(1, 1), Size pad = Size(0, 0), Size dilation = Size(1, 1));
+        static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W DeconvolutionLayer : public BaseConvolutionLayer
+    class CV_EXPORTS DeconvolutionLayer : public BaseConvolutionLayer
     {
     public:
-
-        static CV_WRAP Ptr<BaseConvolutionLayer> create(Size kernel = Size(3, 3), Size stride = Size(1, 1), Size pad = Size(0, 0), Size dilation = Size(1, 1), Size adjustPad = Size());
+        static Ptr<BaseConvolutionLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W LRNLayer : public Layer
+    class CV_EXPORTS LRNLayer : public Layer
     {
     public:
-
         enum Type
         {
             CHANNEL_NRM,
             SPATIAL_NRM
         };
-        CV_PROP_RW int type;
+        int type;
 
-        CV_PROP_RW int size;
-        CV_PROP_RW double alpha, beta, bias;
-        CV_PROP_RW bool normBySize;
+        int size;
+        float alpha, beta, bias;
+        bool normBySize;
 
-        static CV_WRAP Ptr<LRNLayer> create(int type = LRNLayer::CHANNEL_NRM, int size = 5,
-                                            double alpha = 1, double beta = 0.75, double bias = 1,
-                                            bool normBySize = true);
+        static Ptr<LRNLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W PoolingLayer : public Layer
+    class CV_EXPORTS PoolingLayer : public Layer
     {
     public:
-
         enum Type
         {
             MAX,
@@ -258,139 +254,146 @@ namespace dnn
             STOCHASTIC
         };
 
-        CV_PROP_RW int type;
-        CV_PROP_RW Size kernel, stride, pad;
-        CV_PROP_RW bool globalPooling;
-        CV_PROP_RW String padMode;
+        int type;
+        Size kernel, stride, pad;
+        bool globalPooling;
+        String padMode;
 
-        static CV_WRAP Ptr<PoolingLayer> create(int type = PoolingLayer::MAX, Size kernel = Size(2, 2),
-                                                Size stride = Size(1, 1), Size pad = Size(0, 0),
-                                                const cv::String& padMode = "");
-        static CV_WRAP Ptr<PoolingLayer> createGlobal(int type = PoolingLayer::MAX);
+        static Ptr<PoolingLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W SoftmaxLayer : public Layer
+    class CV_EXPORTS SoftmaxLayer : public Layer
     {
     public:
-
-        static CV_WRAP Ptr<SoftmaxLayer> create(int axis = 1);
+        static Ptr<SoftmaxLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W InnerProductLayer : public Layer
+    class CV_EXPORTS InnerProductLayer : public Layer
     {
     public:
-        CV_PROP_RW int axis;
-
-        static CV_WRAP Ptr<InnerProductLayer> create(int axis = 1);
+        int axis;
+        static Ptr<InnerProductLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W MVNLayer : public Layer
+    class CV_EXPORTS MVNLayer : public Layer
     {
     public:
-        CV_PROP_RW double eps;
-        CV_PROP_RW bool normVariance, acrossChannels;
+        float eps;
+        bool normVariance, acrossChannels;
 
-        static CV_WRAP Ptr<MVNLayer> create(bool normVariance = true, bool acrossChannels = false, double eps = 1e-9);
+        static Ptr<MVNLayer> create(const LayerParams& params);
     };
 
     /* Reshaping */
 
-    class CV_EXPORTS_W ReshapeLayer : public Layer
+    class CV_EXPORTS ReshapeLayer : public Layer
     {
     public:
-        CV_PROP_RW BlobShape newShapeDesc;
-        CV_PROP_RW Range newShapeRange;
+        std::vector<int> newShapeDesc;
+        Range newShapeRange;
+
+        static Ptr<ReshapeLayer> create(const LayerParams& params);
+    };
 
-        static CV_WRAP Ptr<ReshapeLayer> create(const BlobShape &newShape, Range applyingRange = Range::all(),
-                                                bool enableReordering = false);
+    class CV_EXPORTS FlattenLayer : public Layer
+    {
+    public:
+        static Ptr<FlattenLayer> create(const LayerParams &params);
    };
-    class CV_EXPORTS_W ConcatLayer : public Layer
+    class CV_EXPORTS ConcatLayer : public Layer
     {
     public:
         int axis;
 
-        static CV_WRAP Ptr<ConcatLayer> create(int axis = 1);
+        static Ptr<ConcatLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W SplitLayer : public Layer
+    class CV_EXPORTS SplitLayer : public Layer
     {
     public:
         int outputsCount; //!< Number of copies that will be produced (is ignored when negative).
 
-        static CV_WRAP Ptr<SplitLayer> create(int outputsCount = -1);
+        static Ptr<SplitLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W SliceLayer : public Layer
+    class CV_EXPORTS SliceLayer : public Layer
     {
     public:
-        CV_PROP_RW int axis;
-        CV_PROP std::vector<int> sliceIndices;
+        int axis;
+        std::vector<int> sliceIndices;
 
-        static CV_WRAP Ptr<SliceLayer> create(int axis);
-        static CV_WRAP Ptr<SliceLayer> create(int axis, const std::vector<int> &sliceIndices);
+        static Ptr<SliceLayer> create(const LayerParams &params);
     };
 
-    /* Activations */
-
-    class CV_EXPORTS_W ReLULayer : public Layer
+    class CV_EXPORTS PermuteLayer : public Layer
     {
     public:
-        CV_PROP_RW double negativeSlope;
+        static Ptr<PermuteLayer> create(const LayerParams& params);
+    };
 
-        static CV_WRAP Ptr<ReLULayer> create(double negativeSlope = 0);
+    class CV_EXPORTS PaddingLayer : public Layer
+    {
+    public:
+        static Ptr<PaddingLayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W ChannelsPReLULayer : public Layer
+    /* Activations */
+
+    class CV_EXPORTS ReLULayer : public Layer
     {
     public:
-        static CV_WRAP Ptr<ChannelsPReLULayer> create();
+        static Ptr<ReLULayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W TanHLayer : public Layer
+    class CV_EXPORTS ChannelsPReLULayer : public Layer
     {
     public:
-        static CV_WRAP Ptr<TanHLayer> create();
+        static Ptr<ChannelsPReLULayer> create(const LayerParams& params);
     };
 
-    class CV_EXPORTS_W SigmoidLayer : public Layer
+    class CV_EXPORTS TanHLayer : public Layer
     {
     public:
-        static CV_WRAP Ptr<SigmoidLayer> create();
+        static Ptr<TanHLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W BNLLLayer : public Layer
+    class CV_EXPORTS SigmoidLayer : public Layer
     {
     public:
-        static CV_WRAP Ptr<BNLLLayer> create();
+        static Ptr<SigmoidLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W AbsLayer : public Layer
+    class CV_EXPORTS BNLLLayer : public Layer
     {
     public:
-        static CV_WRAP Ptr<AbsLayer> create();
+        static Ptr<BNLLLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W PowerLayer : public Layer
+    class CV_EXPORTS AbsLayer : public Layer
     {
     public:
-        CV_PROP_RW double power, scale, shift;
+        static Ptr<AbsLayer> create(const LayerParams &params);
+    };
 
-        static CV_WRAP Ptr<PowerLayer> create(double power = 1, double scale = 1, double shift = 0);
+    class CV_EXPORTS PowerLayer : public Layer
+    {
+    public:
+        static Ptr<PowerLayer> create(const LayerParams &params);
     };
 
     /* Layers used in semantic segmentation */
 
-    class CV_EXPORTS_W CropLayer : public Layer
+    class CV_EXPORTS CropLayer : public Layer
     {
     public:
-        CV_PROP int startAxis;
-        CV_PROP std::vector<int> offset;
+        int startAxis;
+        std::vector<int> offset;
 
-        static Ptr<CropLayer> create(int start_axis, const std::vector<int> &offset);
+        static Ptr<CropLayer> create(const LayerParams &params);
     };
 
-    class CV_EXPORTS_W EltwiseLayer : public Layer
+    class CV_EXPORTS EltwiseLayer : public Layer
     {
     public:
         enum EltwiseOp
         {
@@ -400,25 +403,49 @@ namespace dnn
             MAX = 2,
         };
 
-        static Ptr<EltwiseLayer> create(EltwiseOp op, const std::vector<int> &coeffs);
+        static Ptr<EltwiseLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS BatchNormLayer : public Layer
+    {
+    public:
+        static Ptr<BatchNormLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS MaxUnpoolLayer : public Layer
+    {
+    public:
+        static Ptr<MaxUnpoolLayer> create(const LayerParams &params);
+    };
+
+    class CV_EXPORTS ScaleLayer : public Layer
+    {
+    public:
+        static Ptr<ScaleLayer> create(const LayerParams& params);
params); + }; + + class CV_EXPORTS ShiftLayer : public Layer + { + public: + static Ptr create(const LayerParams& params); }; - class CV_EXPORTS_W BatchNormLayer : public Layer + class CV_EXPORTS PriorBoxLayer : public Layer { public: - static CV_WRAP Ptr create(bool hasWeights, bool hasBias, float epsilon); + static Ptr create(const LayerParams& params); }; - class CV_EXPORTS_W MaxUnpoolLayer : public Layer + class CV_EXPORTS DetectionOutputLayer : public Layer { public: - static CV_WRAP Ptr create(Size poolKernel, Size poolPad, Size poolStride); + static Ptr create(const LayerParams& params); }; - class CV_EXPORTS_W ScaleLayer : public Layer + class NormalizeBBoxLayer : public Layer { public: - static CV_WRAP Ptr create(bool hasBias); + static Ptr create(const LayerParams& params); }; //! @} diff --git a/modules/dnn/include/opencv2/dnn/blob.hpp b/modules/dnn/include/opencv2/dnn/blob.hpp deleted file mode 100644 index 71e929de353..00000000000 --- a/modules/dnn/include/opencv2/dnn/blob.hpp +++ /dev/null @@ -1,341 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_DNN_BLOB_HPP__ -#define __OPENCV_DNN_DNN_BLOB_HPP__ -#include -#include -#include -#include - -namespace cv -{ -namespace dnn -{ -//! @addtogroup dnn -//! @{ - - /** @brief Lightweight class for storing and processing a shape of blob (or anything else). */ - struct CV_EXPORTS_W BlobShape - { - BlobShape(); //!< Creates [1, 1, 1, 1] shape @todo Make more clearer behavior. 
diff --git a/modules/dnn/include/opencv2/dnn/blob.hpp b/modules/dnn/include/opencv2/dnn/blob.hpp
deleted file mode 100644
index 71e929de353..00000000000
--- a/modules/dnn/include/opencv2/dnn/blob.hpp
+++ /dev/null
@@ -1,341 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DNN_DNN_BLOB_HPP__
-#define __OPENCV_DNN_DNN_BLOB_HPP__
-#include <opencv2/core.hpp>
-#include <vector>
-#include <ostream>
-#include <iostream>
-
-namespace cv
-{
-namespace dnn
-{
-//! @addtogroup dnn
-//! @{
-
-    /** @brief Lightweight class for storing and processing a shape of blob (or anything else). */
-    struct CV_EXPORTS_W BlobShape
-    {
-        BlobShape();                                        //!< Creates [1, 1, 1, 1] shape @todo Make more clearer behavior.
-        explicit BlobShape(int s0);                         //!< Creates 1-dim shape [@p s0]
-        BlobShape(int s0, int s1);                          //!< @overload
-        BlobShape(int s0, int s1, int s2);                  //!< @overload
-        BlobShape(int num, int cn, int rows, int cols);     //!< Creates 4-dim shape [@p num, @p cn, @p rows, @p cols]
-
-        //! Creates n-dim shape from the @p sizes array; if @p sizes is NULL then shape will contain unspecified data
-        BlobShape(int ndims, const int *sizes);
-        BlobShape(const std::vector<int> &sizes);           //!< Creates n-dim shape from the @p sizes vector
-        template<int n>
-        BlobShape(const Vec<int, n> &shape);                //!< Creates n-dim shape from @ref cv::Vec
-
-        //! Creates n-dim shape and fill its by @p fill
-        static BlobShape all(int ndims, int fill = 1);
-
-        /** @brief Returns number of dimensions. */
-        int dims() const;
-
-        /** @brief Returns reference to the size of the specified @p axis.
-         *
-         * Negative @p axis is supported, in this case a counting starts from the last axis,
-         * i. e. -1 corresponds to last axis.
-         * If non-existing axis was passed then an error will be generated.
-         */
-        int &size(int axis);
-
-        /** @brief Returns the size of the specified @p axis.
-         *  @see size()
-         */
-        int size(int axis) const;
-
-        int operator[](int axis) const; //!< Does the same thing as size(axis).
-        int &operator[](int axis);      //!< Does the same thing as size(int) const.
-
-        /** @brief Returns the size of the specified @p axis.
-         *
-         * Does the same thing as size(int) const, but if non-existing axis will be passed then 1 will be returned,
-         * therefore this function always finishes successfully.
-         */
-        int xsize(int axis) const;
-
-        /** @brief Converts @p axis index to canonical format (where 0 <= @p axis < dims()). */
-        int canonicalAxis(int axis) const;
-
-        /** @brief Returns the product of all sizes of axes. */
-        ptrdiff_t total() const;
-
-        /** @brief Computes the product of sizes of axes among the specified axes range [@p startAxis; @p endAxis).
-         * @details Negative axis indexing can be used. @sa Blob::total(int,int)
-         */
-        ptrdiff_t total(int startAxis, int endAxis = INT_MAX) const;
-
-        /** @brief Constructs new shape from axes in range [@p startAxis; @p endAxis).
-         * @details Negative axis indexing can be used. @sa Blob::total(int,int)
-         */
-        BlobShape slice(int startAxis, int endAxis = INT_MAX) const;
-
-        /** @brief Returns pointer to the first element of continuous size array. */
-        const int *ptr() const;
-        /** @overload */
-        int *ptr();
-
-        bool equal(const BlobShape &other) const;       //!< Checks equality of two shapes.
-        bool operator== (const BlobShape &r) const;     //!< @sa equal()
-
-        BlobShape operator+ (const BlobShape &r) const; //!< Contacenates two shapes.
-
-        static BlobShape like(const Mat &m);    //!< Returns shape of passed Mat.
-        static BlobShape like(const UMat &m);   //!< Returns shape of passed UMat.
-
-        static BlobShape empty();               //!< Returns empty shape [].
-        bool isEmpty() const;                   //!< Returns true if shape is empty (i.e []).
-
-#ifdef CV_CXX_MOVE_SEMANTICS
-        //TBD
-#endif
-
-    private:
-        cv::AutoBuffer<int, 4> sz;
-    };
-
-
-    /** @brief This class provides methods for continuous n-dimensional CPU and GPU array processing.
-     *
-     * The class is realized as a wrapper over @ref cv::Mat and @ref cv::UMat.
-     * It will support methods for switching and logical synchronization between CPU and GPU.
-     */
-    class CV_EXPORTS_W Blob
-    {
-    public:
-        Blob();
-
-        /** @brief Constructs blob with specified @p shape and @p type.
-         */
-        explicit Blob(const BlobShape &shape, int type = CV_32F, int allocFlags = ALLOC_MAT);
-
-        /** @brief Constructs Blob from existing Mat or UMat. */
-        Blob(InputArray data);
-
-        /** @brief Constructs 4-dimensional blob (so-called batch) from image or array of images.
-         * @param image 2-dimensional multi-channel or 3-dimensional single-channel image (or array of such images)
-         * @param dstCn specifies size of second axis of ouptut blob
-         */
-        static Blob fromImages(InputArray image, int dstCn = -1);
-
-        /** @brief Works like Blob::fromImages() but in-place. */
-        void batchFromImages(InputArray image, int dstCn = -1);
-
-        /** @brief Creates blob with specified @p shape and @p type. */
-        void create(const BlobShape &shape, int type = CV_32F, int allocFlags = ALLOC_MAT);
-
-        /** @brief Creates blob from Mat or UMat without copying the data.
-         * @details If in is Mat then Mat data is populated, otherwise - UMat.
-         */
-        void fill(InputArray in);
-
-        /** @brief Creates blob from user data.
-         * @details If @p deepCopy is false then CPU data will not be allocated.
-         */
-        void fill(const BlobShape &shape, int type, void *data, bool deepCopy = true);
-
-        /** @brief Sets @p value to the last used data (if @p allocFlags = -1).
-         * @details If @p allocFlags != -1 then destination data (Mat or UMat) is determined by flags from AllocFlag enum like in create().
-         */
-        void setTo(InputArray value, int allocFlags = -1);
-
-        Mat& matRef(bool writeOnly = true);     //!< Returns reference to cv::Mat, containing blob data.
-        const Mat& matRefConst() const;         //!< Returns reference to cv::Mat, containing blob data, for read-only purposes.
-        UMat &umatRef(bool writeOnly = true);   //!< Returns reference to cv::UMat, containing blob data.
-        const UMat &umatRefConst() const;       //!< Returns reference to cv::UMat, containing blob data, for read-only purposes.
-
-        template<typename XMat>
-        XMat &getRef(bool writeOnly = true);
-        template<typename XMat>
-        const XMat &getRefConst() const;
-
-        void updateMat(bool syncData = true) const;     //!< Actualizes data stored inside Mat of Blob; if @p syncData is false then only shape will be actualized.
-        void updateUMat(bool syncData = true) const;    //!< Actualizes data stored inside Mat of Blob; if @p syncData is false then only shape will be actualized.
-        void sync() const;                              //!< Updates Mat and UMat of Blob.
-
-        /** @brief Returns number of blob dimensions. */
-        int dims() const;
-
-        /** @brief Returns the size of the specified @p axis.
-         *
-         * Negative @p axis is supported, in this case a counting starts from the last axis,
-         * i. e. -1 corresponds to last axis.
-         * If non-existing axis was passed then an error will be generated.
-         */
-        int size(int axis) const;
-
-        /** @brief Returns the size of the specified @p axis.
-         *
-         * Does the same thing as size(int) const, but if non-existing axis will be passed then 1 will be returned,
-         * therefore this function always finishes successfully.
-         */
-        int xsize(int axis) const;
-
-        /** @brief Computes the product of sizes of axes among the specified axes range [@p startAxis; @p endAxis).
-         * @param startAxis the first axis to include in the range.
-         * @param endAxis   the first axis to exclude from the range.
-         * @details Negative axis indexing can be used.
-         */
-        size_t total(int startAxis = 0, int endAxis = INT_MAX) const;
-
-        /** @brief Converts @p axis index to canonical format (where 0 <= @p axis < dims()). */
-        int canonicalAxis(int axis) const;
-
-        /** @brief Returns shape of the blob. */
-        BlobShape shape() const;
-
-        /** @brief Checks equality of two blobs shapes.
-         */
-        bool equalShape(const Blob &other) const;
-
-        /** @brief Returns slice of first two dimensions.
-         * @details The behaviour is similar to the following numpy code: blob[n, cn, ...]
-         */
-        Mat getPlane(int n, int cn);
-
-        /** @brief Returns slice of first dimension.
-         * @details The behaviour is similar to getPlane(), but returns all
-         * channels * rows * cols values, corresponding to the n-th value
-         * of the first dimension.
-         */
-        Mat getPlanes(int n);
-
-        /* Shape getters of 4-dimensional blobs. */
-        int cols() const;       //!< Returns size of the fourth axis blob.
-        int rows() const;       //!< Returns size of the thrid axis blob.
-        int channels() const;   //!< Returns size of the second axis blob.
-        int num() const;        //!< Returns size of the first axis blob.
-        Size size2() const;     //!< Returns cv::Size(cols(), rows())
-        Vec4i shape4() const;   //!< Returns shape of first four blob axes.
-
-        /** @brief Returns linear index of the element with specified coordinates in the blob.
-         *
-         * If @p n < dims() then unspecified coordinates will be filled by zeros.
-         * If @p n > dims() then extra coordinates will be ignored.
-         */
-        template<int n>
-        size_t offset(const Vec<int, n> &pos) const;
-        /** @overload */
-        size_t offset(int n = 0, int cn = 0, int row = 0, int col = 0) const;
-
-        /* CPU pointer getters */
-        /** @brief Returns pointer to the blob element with the specified position, stored in CPU memory.
-         *
-         * @p n correspond to the first axis, @p cn - to the second, etc.
-         * If dims() > 4 then unspecified coordinates will be filled by zeros.
-         * If dims() < 4 then extra coordinates will be ignored.
-         */
-        uchar *ptr(int n = 0, int cn = 0, int row = 0, int col = 0);
-        /** @overload */
-        template<typename Type>
-        Type *ptr(int n = 0, int cn = 0, int row = 0, int col = 0);
-        /** @overload ptr() */
-        float *ptrf(int n = 0, int cn = 0, int row = 0, int col = 0);
-        //TODO: add const ptr methods
-
-        /** @brief Shares data from other @p blob.
-         * @returns *this
-         */
-        Blob &shareFrom(const Blob &blob);
-
-        /** @brief Changes shape of the blob without copying the data.
-         * @returns *this
-         */
-        Blob &reshape(const BlobShape &shape);
-
-        /** @brief Changes shape of the blob without copying the data.
-         * @returns shallow copy of original blob with new shape.
-         */
-        Blob reshaped(const BlobShape &newShape) const;
-
-        int type() const;       //!< Returns type of the blob.
-        int elemSize() const;   //!< Returns size of single element in bytes.
-        int getState() const;   //!< Returns current state of the blob, @see DataState.
-
-    private:
-        const int *sizes() const;
-
-# define CV_DNN_UMAT //DBG
-#ifdef HAVE_OPENCL
-# define CV_DNN_UMAT
-#endif
-
-#ifdef CV_DNN_UMAT
-# define CV_DNN_UMAT_ONLY(expr) (expr)
-#else
-# define CV_DNN_UMAT_ONLY(expr)
-#endif
-
-#ifndef CV_DNN_UMAT
-    Mat m;
-#else
-    mutable Mat m;
-    mutable UMat um;
-    mutable uchar state;
-#endif
-
-public:
-    enum DataState
-    {
-        UNINITIALIZED = 0,
-        HEAD_AT_MAT = 1 << 0,
-        HEAD_AT_UMAT = 1 << 1,
-        SYNCED = HEAD_AT_MAT | HEAD_AT_UMAT
-    };
-
-    enum AllocFlag
-    {
-        ALLOC_MAT = HEAD_AT_MAT,
-        ALLOC_UMAT = HEAD_AT_UMAT,
-        ALLOC_BOTH = SYNCED
-    };
-    };
-
-//! @}
-}
-}
-
-#include "blob.inl.hpp"
-
-#endif
diff --git a/modules/dnn/include/opencv2/dnn/blob.inl.hpp b/modules/dnn/include/opencv2/dnn/blob.inl.hpp
deleted file mode 100644
index b7f741e3acb..00000000000
--- a/modules/dnn/include/opencv2/dnn/blob.inl.hpp
+++ /dev/null
@@ -1,533 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DNN_DNN_BLOB_INL_HPP__
-#define __OPENCV_DNN_DNN_BLOB_INL_HPP__
-#include "blob.hpp"
-
-namespace cv
-{
-namespace dnn
-{
-
-inline BlobShape::BlobShape()
-{
-    sz.allocate(4);
-    for (size_t i = 0; i < sz.size(); i++)
-        sz[i] = 1;
-}
-
-inline BlobShape BlobShape::all(int ndims, int fill)
-{
-    CV_Assert(ndims >= 0);
-    BlobShape res;
-    res.sz.allocate(ndims);
-    for (int i = 0; i < ndims; i++)
-        res.sz[i] = fill;
-    return res;
-}
-
-inline BlobShape::BlobShape(int ndims, const int *sizes) : sz( (size_t)std::max(ndims, 0) )
-{
-    CV_Assert(ndims >= 0);
-    if (!sizes)
-        return;
-    for (int i = 0; i < ndims; i++)
-        sz[i] = sizes[i];
-}
-
-inline BlobShape::BlobShape(int s0) : sz(1)
-{
-    sz[0] = s0;
-}
-
-inline BlobShape::BlobShape(int s0, int s1) : sz(2)
-{
-    sz[0] = s0;
-    sz[1] = s1;
-}
-
-inline BlobShape::BlobShape(int s0, int s1, int s2) : sz(3)
-{
-    sz[0] = s0;
-    sz[1] = s1;
-    sz[2] = s2;
-}
-
-inline BlobShape::BlobShape(int num, int cn, int rows, int cols) : sz(4)
-{
-    sz[0] = num;
-    sz[1] = cn;
-    sz[2] = rows;
-    sz[3] = cols;
-}
-
-inline BlobShape::BlobShape(const std::vector<int> &sizes) : sz( sizes.size() )
-{
-    for (int i = 0; i < (int)sizes.size(); i++)
-        sz[i] = sizes[i];
-}
-
-template<int n>
-inline BlobShape::BlobShape(const Vec<int, n> &shape) : sz(n)
-{
-    for (int i = 0; i < n; i++)
-        sz[i] = shape[i];
-}
-
-inline int BlobShape::dims() const
-{
-    return (int)sz.size();
-}
-
-inline int BlobShape::xsize(int axis) const
-{
-    if (axis < -dims() || axis >= dims())
-        return 1;
-
-    return sz[(axis < 0) ? axis + dims() : axis];
-}
-
-inline int BlobShape::size(int axis) const
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return sz[(axis < 0) ? axis + dims() : axis];
-}
-
-inline int &BlobShape::size(int axis)
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return sz[(axis < 0) ? axis + dims() : axis];
-}
-
-inline int BlobShape::operator[] (int axis) const
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return sz[(axis < 0) ? axis + dims() : axis];
-}
-
-inline int &BlobShape::operator[] (int axis)
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return sz[(axis < 0) ? axis + dims() : axis];
-}
-
-inline int BlobShape::canonicalAxis(int axis) const
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return (axis < 0) ? axis + dims() : axis;
-}
-
-inline ptrdiff_t BlobShape::total() const
-{
-    if (dims() == 0)
-        return 0;
-
-    ptrdiff_t res = 1;
-    for (int i = 0; i < dims(); i++)
-        res *= sz[i];
-    return res;
-}
-
-inline ptrdiff_t BlobShape::total(int startAxis, int endAxis) const
-{
-    if (isEmpty())
-        return 0;
-
-    if (endAxis == INT_MAX)
-        endAxis = dims();
-    else if (endAxis < 0)
-        endAxis += dims();
-    startAxis = (startAxis < 0) ? startAxis + dims() : startAxis;
-    CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
-
-    ptrdiff_t res = 1;
-    for (int i = startAxis; i < endAxis; i++)
-        res *= sz[i];
-    return res;
-}
-
-inline BlobShape BlobShape::slice(int startAxis, int endAxis) const
-{
-    if (isEmpty())
-        return BlobShape::empty();
-
-    if (endAxis == INT_MAX)
-        endAxis = dims();
-    else if (endAxis < 0)
-        endAxis += dims();
-    startAxis = (startAxis < 0) ? startAxis + dims() : startAxis;
-    CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims());
-
-    BlobShape res(endAxis - startAxis, (const int*)NULL);
-    for (int i = startAxis; i < endAxis; i++)
-        res[i - startAxis] = sz[i];
-    return res;
-}
-
-inline const int *BlobShape::ptr() const
-{
-    return sz;
-}
-
-inline int *BlobShape::ptr()
-{
-    return sz;
-}
-
-inline bool BlobShape::equal(const BlobShape &other) const
-{
-    if (this->dims() != other.dims())
-        return false;
-
-    for (int i = 0; i < other.dims(); i++)
-    {
-        if (sz[i] != other.sz[i])
-            return false;
-    }
-
-    return true;
-}
-
-inline bool BlobShape::operator==(const BlobShape &r) const
-{
-    return this->equal(r);
-}
-
-inline BlobShape BlobShape::like(const Mat &m)
-{
-    return BlobShape(m.dims, (const int*)m.size);
-}
-
-inline BlobShape BlobShape::like(const UMat &m)
-{
-    return BlobShape(m.dims, (const int*)m.size);
-}
-
-inline BlobShape BlobShape::empty()
-{
-    return BlobShape(0, (const int*)NULL);
-}
-
-inline bool BlobShape::isEmpty() const
-{
-    return dims() == 0;
-}
-
-inline BlobShape BlobShape::operator+(const BlobShape &r) const
-{
-    BlobShape newShape(this->dims() + r.dims(), (int*)NULL);
-    for (int i = 0; i < this->dims(); i++)
-        newShape[i] = (*this)[i];
-    for (int i = 0; i < r.dims(); i++)
-        newShape[this->dims() + i] = r[i];
-    return newShape;
-}
-
-CV_EXPORTS std::ostream &operator<< (std::ostream &stream, const BlobShape &shape);
-
-/////////////////////////////////////////////////////////////////////
-
-#ifndef CV_DNN_UMAT
-# define CV_DNN_SWITCH_MU(cpu_expr, gpu_expr) (cpu_expr)
-#else
-# define CV_DNN_SWITCH_MU(cpu_expr, gpu_expr) ((state == HEAD_AT_UMAT) ? (gpu_expr) : (cpu_expr))
-#endif
-
-
-inline int Blob::dims() const
-{
-    return CV_DNN_SWITCH_MU(m.dims, um.dims);
-}
-
-inline const int * Blob::sizes() const
-{
-    return CV_DNN_SWITCH_MU((const int*)m.size, (const int*)um.size);
-}
-
-inline int Blob::type() const
-{
-    return CV_DNN_SWITCH_MU(m.type(), um.type());
-}
-
-template<int n>
-inline size_t Blob::offset(const Vec<int, n> &pos) const
-{
-    const MatStep &step = CV_DNN_SWITCH_MU(m.step, um.step);
-    size_t ofs = 0;
-    int i;
-    for (i = 0; i < std::min(n, dims()); i++)
-    {
-        CV_DbgAssert(pos[i] >= 0 && pos[i] < size(i));
-        ofs += step[i] * pos[i];
-    }
-    for (; i < dims(); i++)
-        CV_DbgAssert(pos[i] == 0);
-    CV_DbgAssert(ofs % elemSize() == 0);
-    return ofs / elemSize();
-}
-
-inline int Blob::canonicalAxis(int axis) const
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return (axis < 0) ? axis + dims() : axis;
-}
-
-inline int Blob::xsize(int axis) const
-{
-    if (axis < -dims() || axis >= dims())
-        return 1;
-
-    return sizes()[(axis < 0) ? axis + dims() : axis];
-}
-
-inline int Blob::size(int axis) const
-{
-    CV_Assert(-dims() <= axis && axis < dims());
-    return sizes()[(axis < 0) ? axis + dims() : axis];
axis + dims() : axis]; -} - -inline size_t Blob::total(int startAxis, int endAxis) const -{ - if (startAxis < 0) - startAxis += dims(); - - if (endAxis == INT_MAX) - endAxis = dims(); - else if (endAxis < 0) - endAxis += dims(); - - CV_Assert(0 <= startAxis && startAxis <= endAxis && endAxis <= dims()); - - size_t cnt = 1; //fix: assume that slice isn't empty - for (int i = startAxis; i < endAxis; i++) - cnt *= (size_t)sizes()[i]; - - return cnt; -} - -inline size_t Blob::offset(int n, int cn, int row, int col) const -{ - return offset(Vec4i(n, cn, row, col)); -} - -inline float *Blob::ptrf(int n, int cn, int row, int col) -{ - return matRef(false).ptr() + offset(n, cn, row, col); -} - -inline uchar *Blob::ptr(int n, int cn, int row, int col) -{ - Mat &mat = matRef(false); - return mat.ptr() + mat.elemSize() * offset(n, cn, row, col); -} - -template -inline Dtype* Blob::ptr(int n, int cn, int row, int col) -{ - CV_Assert(type() == cv::DataDepth::value); - return (Dtype*) ptr(n, cn, row, col); -} - -inline BlobShape Blob::shape() const -{ - return BlobShape(dims(), sizes()); -} - -inline bool Blob::equalShape(const Blob &other) const -{ - if (this->dims() != other.dims()) - return false; - - for (int i = 0; i < dims(); i++) - { - if (this->sizes()[i] != other.sizes()[i]) - return false; - } - return true; -} - -inline Mat& Blob::matRef(bool writeOnly) -{ -#ifdef CV_DNN_UMAT - updateMat(!writeOnly); - state = HEAD_AT_MAT; -#else - (void)writeOnly; -#endif - return m; -} - -inline const Mat& Blob::matRefConst() const -{ - CV_DNN_UMAT_ONLY( updateMat() ); - return m; -} - -inline UMat &Blob::umatRef(bool writeOnly) -{ -#ifndef CV_DNN_UMAT - CV_Error(Error::GpuNotSupported, ""); - (void)writeOnly; - return *(new UMat()); -#else - updateUMat(!writeOnly); - state = HEAD_AT_UMAT; - return um; -#endif -} - -inline const UMat &Blob::umatRefConst() const -{ -#ifndef CV_DNN_UMAT - CV_Error(Error::GpuNotSupported, ""); - return *(new UMat()); -#else - updateUMat(); - return um; -#endif -} - -template<> -inline Mat &Blob::getRef(bool writeOnly) -{ - return matRef(writeOnly); -} - -template<> -inline UMat &Blob::getRef(bool writeOnly) -{ - return umatRef(writeOnly); -} - -template<> -inline const Mat &Blob::getRefConst() const -{ - return matRefConst(); -} - -template<> -inline const UMat &Blob::getRefConst() const -{ - return umatRefConst(); -} - -inline Mat Blob::getPlane(int n, int cn) -{ - CV_Assert(dims() > 2); - return Mat(dims() - 2, sizes() + 2, type(), ptr(n, cn)); -} - -inline Mat Blob::getPlanes(int n) -{ - CV_Assert(dims() > 3); - return Mat(dims() - 1, sizes() + 1, type(), ptr(n)); -} - -inline int Blob::cols() const -{ - return xsize(3); -} - -inline int Blob::rows() const -{ - return xsize(2); -} - -inline int Blob::channels() const -{ - return xsize(1); -} - -inline int Blob::num() const -{ - return xsize(0); -} - -inline Size Blob::size2() const -{ - return Size(cols(), rows()); -} - -inline Blob &Blob::shareFrom(const Blob &blob) -{ - this->m = blob.m; -#ifdef CV_DNN_UMAT - this->um = blob.um; - this->state = blob.state; -#endif - return *this; -} - -inline Blob &Blob::reshape(const BlobShape &newShape) -{ - if (!m.empty()) m = m.reshape(1, newShape.dims(), newShape.ptr()); -#ifdef CV_DNN_UMAT - if (!um.empty()) um = um.reshape(1, newShape.dims(), newShape.ptr()); -#endif - return *this; -} - -inline Blob Blob::reshaped(const BlobShape &newShape) const -{ - Blob res(*this); //also, res.shareFrom(*this) could be used - res.reshape(newShape); - return res; -} - -inline int 
-{
-    return CV_ELEM_SIZE(type());
-}
-
-inline int Blob::getState() const
-{
-#ifdef CV_DNN_UMAT
-    return this->state;
-#else
-    return m.empty() ? UNINITIALIZED : HEAD_AT_MAT;
-#endif
-}
-
-}
-}
-
-#endif
diff --git a/modules/dnn/include/opencv2/dnn/dict.hpp b/modules/dnn/include/opencv2/dnn/dict.hpp
index f7cd0f21150..1096cc0ff26 100644
--- a/modules/dnn/include/opencv2/dnn/dict.hpp
+++ b/modules/dnn/include/opencv2/dnn/dict.hpp
@@ -118,6 +118,9 @@ class CV_EXPORTS Dict
     //! If the @p key in the dictionary then returns pointer to its value, else returns NULL.
     DictValue *ptr(const String &key);
 
+    /** @overload */
+    const DictValue *ptr(const String &key) const;
+
     //! If the @p key in the dictionary then returns its value, else an error will be generated.
     const DictValue &get(const String &key) const;
 
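// --- Illustrative sketch, not part of the patch -----------------------------
// Why the const overload of Dict::ptr() is needed: the new create() factories
// receive a const LayerParams& (LayerParams derives from Dict), so probing an
// optional key requires a const-qualified ptr(). "dilation_w" is an assumed,
// Caffe-style key used purely for illustration.
#include <opencv2/dnn.hpp>

static int dilationOrDefault(const cv::dnn::LayerParams &params)
{
    const cv::dnn::DictValue *v = params.ptr("dilation_w"); // resolves to the const overload
    return v ? v->get<int>() : 1;                           // fall back to a default
}
// -----------------------------------------------------------------------------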
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index cdfdfe96627..d2440d5c6f6 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.hpp
@@ -45,7 +45,6 @@
 #include <vector>
 #include <opencv2/core.hpp>
 #include <opencv2/dnn/dict.hpp>
-#include <opencv2/dnn/blob.hpp>
 
 namespace cv
 {
@@ -70,7 +69,7 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     {
     public:
         //TODO: Add ability to name blob params
-        std::vector<Blob> blobs;    //!< List of learned parameters stored as blobs.
+        std::vector<Mat> blobs;     //!< List of learned parameters stored as blobs.
 
         String name; //!< Name of the layer instance (optional, can be used for internal purposes).
         String type; //!< Type name which was used for creating layer by layer factory (optional).
@@ -86,7 +85,7 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     public:
 
         //! List of learned parameters must be stored here to allow reading them by using Net::getParam().
-        CV_PROP_RW std::vector<Blob> blobs;
+        CV_PROP_RW std::vector<Mat> blobs;
 
         /** @brief Allocates internal buffers and output blobs with respect to the shape of inputs.
          *  @param[in]  input  vector of already allocated input blobs
@@ -96,25 +95,25 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         * If this method is called the first time then the @p output vector consists of empty blobs and its size is determined by the number of output connections.
         * This method can be called multiple times if size of any @p input blob was changed.
        */
-        virtual void allocate(const std::vector<Blob*> &input, std::vector<Blob> &output) = 0;
+        virtual void allocate(const std::vector<Mat*> &input, std::vector<Mat> &output) = 0;
 
         /** @brief Given the @p input blobs, computes the output @p blobs.
         *  @param[in]  input  the input blobs.
         *  @param[out] output allocated output blobs, which will store results of the computation.
        */
-        virtual void forward(std::vector<Blob*> &input, std::vector<Blob> &output) = 0;
+        virtual void forward(std::vector<Mat*> &input, std::vector<Mat> &output) = 0;
 
         /** @brief @overload */
-        CV_WRAP void allocate(const std::vector<Blob> &inputs, CV_OUT std::vector<Blob> &outputs);
+        CV_WRAP void allocate(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);
 
         /** @brief @overload */
-        CV_WRAP std::vector<Blob> allocate(const std::vector<Blob> &inputs);
+        CV_WRAP std::vector<Mat> allocate(const std::vector<Mat> &inputs);
 
         /** @brief @overload */
-        CV_WRAP void forward(const std::vector<Blob> &inputs, CV_IN_OUT std::vector<Blob> &outputs);
+        CV_WRAP void forward(const std::vector<Mat> &inputs, CV_IN_OUT std::vector<Mat> &outputs);
 
         /** @brief Allocates layer and computes output.
        */
-        CV_WRAP void run(const std::vector<Blob> &inputs, CV_OUT std::vector<Blob> &outputs);
+        CV_WRAP void run(const std::vector<Mat> &inputs, CV_OUT std::vector<Mat> &outputs);
 
         /** @brief Returns index of input blob into the input array.
         *  @param inputName label of input blob
@@ -248,13 +247,13 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         * @note If updating blob is not empty then @p blob must have the same shape,
         * because network reshaping is not implemented yet.
        */
-        CV_WRAP void setBlob(String outputName, const Blob &blob);
+        CV_WRAP void setBlob(String outputName, const Mat &blob);
 
         /** @brief Returns the layer output blob.
         *  @param outputName the descriptor of the returning layer output blob.
         *  @see connect(String, String)
        */
-        CV_WRAP Blob getBlob(String outputName);
+        CV_WRAP Mat getBlob(String outputName);
 
         /** @brief Sets the new value for the learned param of the layer.
         *  @param layer name or id of the layer.
@@ -264,14 +263,14 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
         * @note If shape of the new blob differs from the previous shape,
         * then the following forward pass may fail.
        */
-        CV_WRAP void setParam(LayerId layer, int numParam, const Blob &blob);
+        CV_WRAP void setParam(LayerId layer, int numParam, const Mat &blob);
 
         /** @brief Returns parameter blob of the layer.
         *  @param layer name or id of the layer.
         *  @param numParam index of the layer parameter in the Layer::blobs array.
         *  @see Layer::blobs
        */
-        CV_WRAP Blob getParam(LayerId layer, int numParam = 0);
+        CV_WRAP Mat getParam(LayerId layer, int numParam = 0);
 
         /** @brief Returns indexes of layers with unconnected outputs.
         */
@@ -341,7 +340,10 @@ namespace dnn //! This namespace is used for dnn module functionlaity.
     /** @brief Loads blob which was serialized as torch.Tensor object of Torch7 framework.
     *  @warning This function has the same limitations as createTorchImporter().
     */
-    CV_EXPORTS_W Blob readTorchBlob(const String &filename, bool isBinary = true);
+    CV_EXPORTS_W Mat readTorchBlob(const String &filename, bool isBinary = true);
+
+    CV_EXPORTS Mat blobFromImage(const Mat& image, double scalefactor=1.0, bool swapRB=true);
+    CV_EXPORTS Mat blobFromImages(const std::vector<Mat>& image, double scalefactor=1.0, bool swapRB=true);
 
 //! @}
 }
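// --- Illustrative sketch, not part of the patch -----------------------------
// What blobFromImage() produces: a 4-D, CV_32F Mat in NCHW order (the layout
// the Mat-based layers consume). A hand-rolled equivalent for one 3-channel
// image, ignoring `scalefactor` and `swapRB`, might look like the code below;
// this is an assumption about the helper's behavior, not its implementation.
#include <opencv2/core.hpp>

static cv::Mat imageToNCHW(const cv::Mat& img)   // img: H x W, 3 channels
{
    CV_Assert(img.channels() == 3);
    cv::Mat imgF;
    img.convertTo(imgF, CV_32F);

    int sz[] = { 1, 3, img.rows, img.cols };
    cv::Mat blob(4, sz, CV_32F);

    // 2-D headers over the three channel planes stored inside the blob
    cv::Mat planes[] = {
        cv::Mat(img.rows, img.cols, CV_32F, blob.ptr<float>(0, 0)),
        cv::Mat(img.rows, img.cols, CV_32F, blob.ptr<float>(0, 1)),
        cv::Mat(img.rows, img.cols, CV_32F, blob.ptr<float>(0, 2))
    };
    int fromTo[] = { 0,0, 1,1, 2,2 };            // interleaved -> planar, no swap
    cv::mixChannels(&imgF, 1, planes, 3, fromTo, 3);
    return blob;
}
// -----------------------------------------------------------------------------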
diff --git a/modules/dnn/include/opencv2/dnn/dnn.inl.hpp b/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
index a272044025e..8a3c72ee983 100644
--- a/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
+++ b/modules/dnn/include/opencv2/dnn/dnn.inl.hpp
@@ -298,6 +298,12 @@ inline DictValue *Dict::ptr(const String &key)
     return (i == dict.end()) ? NULL : &i->second;
 }
 
+inline const DictValue *Dict::ptr(const String &key) const
+{
+    _Dict::const_iterator i = dict.find(key);
+    return (i == dict.end()) ? NULL : &i->second;
+}
+
 inline const DictValue &Dict::get(const String &key) const
 {
     _Dict::const_iterator i = dict.find(key);
diff --git a/modules/dnn/include/opencv2/dnn/layer.hpp b/modules/dnn/include/opencv2/dnn/layer.hpp
index e0510411b11..af663dd3e1e 100644
--- a/modules/dnn/include/opencv2/dnn/layer.hpp
+++ b/modules/dnn/include/opencv2/dnn/layer.hpp
@@ -122,7 +122,7 @@ static _LayerStaticRegisterer __LayerStaticRegisterer_##type(#type, __LayerStati
 template<typename LayerClass>
 Ptr<Layer> _layerDynamicRegisterer(LayerParams &params)
 {
-    return Ptr<Layer>(new LayerClass(params));
+    return Ptr<Layer>(LayerClass::create(params));
 }
 
 //allows automatically register created layer on module load time
diff --git a/modules/dnn/include/opencv2/dnn/shape_utils.hpp b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
index f52e5b9cbc1..6d2d7d65562 100644
--- a/modules/dnn/include/opencv2/dnn/shape_utils.hpp
+++ b/modules/dnn/include/opencv2/dnn/shape_utils.hpp
@@ -43,14 +43,13 @@
 #define __OPENCV_DNN_DNN_SHAPE_UTILS_HPP__
 
 #include <opencv2/core.hpp>
+#include <opencv2/core/types_c.h>
 #include <ostream>
 
 namespace cv {
 namespace dnn {
 
 //Useful shortcut
-typedef BlobShape Shape;
-
 inline std::ostream &operator<< (std::ostream &s, cv::Range &r)
 {
     return s << "[" << r.start << ", " << r.end << ")";
@@ -59,7 +58,7 @@ inline std::ostream &operator<< (std::ostream &s, cv::Range &r)
 
 //Reshaping
 //TODO: add -1 specifier for automatic size inferring
-template<typename Mat>
+/*template<typename Mat>
 void reshape(Mat &m, const BlobShape &shape)
 {
     m = m.reshape(1, shape.dims(), shape.ptr());
@@ -69,7 +68,7 @@ template<typename Mat>
 Mat reshaped(const Mat &m, const BlobShape &shape)
 {
     return m.reshape(1, shape.dims(), shape.ptr());
-}
+}*/
 
 //Slicing
 
@@ -80,22 +79,19 @@ struct _Range : public cv::Range
     _Range(int start, int size = 1) : cv::Range(start, start + size) {}
 };
 
-template<typename Mat>
-Mat slice(const Mat &m, const _Range &r0)
+static inline Mat slice(const Mat &m, const _Range &r0)
 {
-    //CV_Assert(m.dims >= 1);
-    cv::AutoBuffer<Range, 4> ranges(m.dims);
+    Range ranges[CV_MAX_DIM];
     for (int i = 1; i < m.dims; i++)
         ranges[i] = Range::all();
     ranges[0] = r0;
    return m(&ranges[0]);
 }
 
-template<typename Mat>
-Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
+static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
 {
     CV_Assert(m.dims >= 2);
-    cv::AutoBuffer<Range, 4> ranges(m.dims);
+    Range ranges[CV_MAX_DIM];
     for (int i = 2; i < m.dims; i++)
         ranges[i] = Range::all();
     ranges[0] = r0;
@@ -103,11 +99,10 @@ Mat slice(const Mat &m, const _Range &r0, const _Range &r1)
     ranges[1] = r1;
     return m(&ranges[0]);
 }
 
-template<typename Mat>
-Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
+static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
 {
-    CV_Assert(m.dims <= 3);
-    cv::AutoBuffer<Range, 4> ranges(m.dims);
+    CV_Assert(m.dims >= 3);
+    Range ranges[CV_MAX_DIM];
     for (int i = 3; i < m.dims; i++)
         ranges[i] = Range::all();
     ranges[0] = r0;
@@ -116,11 +111,10 @@ Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2)
-template<typename Mat>
-Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
+static inline Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, const _Range &r3)
 {
-    CV_Assert(m.dims <= 4);
-    cv::AutoBuffer<Range, 4> ranges(m.dims);
+    CV_Assert(m.dims >= 4);
+    Range ranges[CV_MAX_DIM];
     for (int i = 4; i < m.dims; i++)
         ranges[i] = Range::all();
     ranges[0] = r0;
@@ -130,7 +124,28 @@ Mat slice(const Mat &m, const _Range &r0, const _Range &r1, const _Range &r2, co
     ranges[3] = r3;
     return m(&ranges[0]);
 }
 
-BlobShape computeShapeByReshapeMask(const BlobShape &srcShape, const BlobShape &maskShape, Range srcRange = Range::all());
+static inline Mat getPlane(const Mat &m, int n, int cn)
+{
+    CV_Assert(m.dims > 2);
+    Range range[CV_MAX_DIM];
+    int sz[CV_MAX_DIM];
+    for(int i = 2; i < m.dims; i++)
+    {
+        sz[i-2] = m.size.p[i];
+        range[i] = Range::all();
+    }
+    range[0] = Range(n, n+1);
+    range[1] = Range(cn, cn+1);
+    return m(range).reshape(1, m.dims-2, sz);
+}
+
+static inline size_t shapeTotal(const std::vector<int>& shape)
+{
+    size_t i, n = shape.size(), p = 1;
+    for( i = 0; i < n; i++ ) p *= shape[i];
+
+    return p;
+}
 
 }
 }
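// --- Illustrative sketch, not part of the patch -----------------------------
// The rewritten helpers operate on plain n-dimensional Mats: slice() takes
// per-axis ranges, getPlane() returns a 2-D header over one channel plane of
// an NCHW blob (no copy), and shapeTotal() multiplies the extents.
#include <opencv2/dnn/shape_utils.hpp>

static void shapeUtilsDemo(const cv::Mat& blob)  // blob: 4-D, e.g. [N, C, H, W]
{
    using namespace cv::dnn;
    cv::Mat firstSample = slice(blob, _Range(0));   // -> [1, C, H, W]
    cv::Mat plane = getPlane(blob, 0, 1);           // -> H x W view of channel 1
    std::vector<int> shape(blob.size.p, blob.size.p + blob.dims);
    size_t elems = shapeTotal(shape);               // N * C * H * W
    (void)firstSample; (void)plane; (void)elems;
}
// -----------------------------------------------------------------------------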
diff --git a/modules/dnn/misc/python/pyopencv_dnn.hpp b/modules/dnn/misc/python/pyopencv_dnn.hpp
index 06661de123a..40ac504f241 100644
--- a/modules/dnn/misc/python/pyopencv_dnn.hpp
+++ b/modules/dnn/misc/python/pyopencv_dnn.hpp
@@ -1,66 +1,5 @@
 #ifdef HAVE_OPENCV_DNN
 typedef dnn::DictValue LayerId;
-typedef std::vector<dnn::Blob> vector_Blob;
-
-template<>
-bool pyopencv_to(PyObject *o, dnn::Blob &blob, const char *name);
-
-template<> struct pyopencvVecConverter<dnn::Blob>
-{
-    static bool to(PyObject* obj, std::vector<dnn::Blob>& value, const ArgInfo info)
-    {
-        if (PyArray_Check(obj))
-        {
-            value.resize(1);
-            return pyopencv_to(obj, value[0], info.name);
-        }
-
-        return pyopencv_to_generic_vec(obj, value, info);
-    }
-
-    static PyObject* from(const std::vector<dnn::Blob>& value)
-    {
-        return pyopencv_from_generic_vec(value);
-    }
-};
-
-template<>
-bool pyopencv_to(PyObject *o, std::vector<dnn::Blob> &blobs, const char *name) //required for Layer::blobs RW
-{
-    return pyopencvVecConverter<dnn::Blob>::to(o, blobs, ArgInfo(name, false));
-}
-
-template<>
-bool pyopencv_to(PyObject *o, dnn::Blob &blob, const char *name)
-{
-    Mat &dst = blob.matRef();
-    if (!pyopencv_to(o, dst, name))
-        return false;
-
-    if (PyArray_Check(o)) //try fix channels
-    {
-        PyArrayObject* oarr = (PyArrayObject*) o;
-
-        if (PyArray_NDIM(oarr) == dst.dims)
-            return true;
-
-        int ndims = PyArray_NDIM(oarr);
-        std::vector<int> shape(ndims);
-        const npy_intp* _sizes = PyArray_DIMS(oarr);
-        for (int i = 0; i < ndims; i++)
-            shape[i] = (int)_sizes[i];
-
-        dst = dst.reshape(1, ndims, &shape[0]);
-    }
-
-    return true;
-}
-
-template<>
-PyObject *pyopencv_from(const dnn::Blob &blob)
-{
-    return pyopencv_from(blob.matRefConst());
-}
 
 template<>
 bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
@@ -87,22 +26,4 @@ bool pyopencv_to(PyObject *o, dnn::DictValue &dv, const char *name)
     return false;
 }
 
-template<>
-bool pyopencv_to(PyObject *o, dnn::BlobShape &shape, const char *name)
-{
-    std::vector<int> data;
-    if (!pyopencv_to_generic_vec(o, data, ArgInfo(name, false)))
-        return false;
-
-    shape = data.size() ? dnn::BlobShape((int)data.size(), &data[0]) : dnn::BlobShape::empty();
-    return true;
-}
-
-template<>
-PyObject *pyopencv_from(const dnn::BlobShape &shape)
-{
-    std::vector<int> data(shape.ptr(), shape.ptr() + shape.dims());
-    return pyopencv_from_generic_vec(data);
-}
-
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/dnn/perf/perf_convolution.cpp b/modules/dnn/perf/perf_convolution.cpp
index 17fda01592c..af37134300e 100644
--- a/modules/dnn/perf/perf_convolution.cpp
+++ b/modules/dnn/perf/perf_convolution.cpp
@@ -21,15 +21,21 @@ CV_ENUM(GroupSize, GROUP_OFF, GROUP_2);
 //Squared Size
 #define SSZ(n) cv::Size(n, n)
 
-typedef std::pair<BlobShape, int> InpShapeNumOut;
+typedef std::pair<std::vector<int>, int> InpShapeNumOut;
 typedef tuple<Size, InpShapeNumOut, GroupSize, StrideSize> ConvParam; //kernel_size, inp shape, groups, stride
 typedef TestBaseWithParam<ConvParam> ConvolutionPerfTest;
 
+static inline std::vector<int> blobShape(int count, int nplanes, int height, int width)
+{
+    int data[] = {count, nplanes, height, width};
+    return std::vector<int>(data, data+4);
+}
+
 PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
     Values(Size(1, 1), Size(3, 3), Size(5, 5), Size(11, 11)),
-    Values(make_pair(BlobShape(1, 4, 224, 224), 64),
-           make_pair(BlobShape(1, 64, 112, 122), 128),
-           make_pair(BlobShape(1, 256, 28, 28), 512)),
+    Values(make_pair(blobShape(1, 4, 224, 224), 64),
+           make_pair(blobShape(1, 64, 112, 122), 128),
+           make_pair(blobShape(1, 256, 28, 28), 512)),
     GroupSize::all(),
     StrideSize::all())
 )
@@ -38,17 +44,20 @@ PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
     ConvParam params = GetParam();
     int ksz     = get<0>(params).width;
-    BlobShape inpShape = get<1>(params).first;
+    std::vector<int> inpShape = get<1>(params).first;
     int outCn   = get<1>(params).second;
     int groups  = get<2>(params);
     int stride  = (ksz >= 11) ? 4 : (int)get<3>(params);
     int inpCn = inpShape[1];
-    Blob wgtBlob(BlobShape(outCn, inpCn/groups, ksz, ksz)), biasBlob(BlobShape(outCn, 1, 1, 1));
-    Blob inpBlob(inpShape);
-    rng.fill(biasBlob.matRef(), RNG::UNIFORM, -1, +1);
-    rng.fill(wgtBlob.matRef(), RNG::UNIFORM, -1, +1);
-    rng.fill(inpBlob.matRef(), RNG::UNIFORM, -1, +1);
+    int wgtSize[] = { outCn, inpCn/groups, ksz, ksz };
+    int biasSize[] = { outCn, 1, 1, 1 };
+    const int wtype = CV_32F;
+    Mat wgtBlob(4, wgtSize, wtype), biasBlob(4, biasSize, wtype);
+    Mat inpBlob(4, &inpShape[0], wtype);
+    rng.fill(biasBlob, RNG::UNIFORM, -1, +1);
+    rng.fill(wgtBlob, RNG::UNIFORM, -1, +1);
+    rng.fill(inpBlob, RNG::UNIFORM, -1, +1);
 
     LayerParams lp;
     lp.set("num_output", outCn);
@@ -59,15 +68,18 @@ PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
     lp.blobs.push_back(wgtBlob);
     lp.blobs.push_back(biasBlob);
 
-    std::vector<Blob*> inpBlobs(1, &inpBlob);
-    std::vector<Blob> outBlobs;
+    std::vector<Mat*> inpBlobs(1, &inpBlob);
+    std::vector<Mat> outBlobs;
 
     cv::setNumThreads(cv::getNumberOfCPUs());
 
     Ptr<Layer> layer = cv::dnn::LayerFactory::createLayerInstance("Convolution", lp);
     layer->allocate(inpBlobs, outBlobs);
 
-    declare.in(inpBlob.matRef(), wgtBlob.matRef(), WARMUP_RNG).out(outBlobs[0].matRef()).tbb_threads(cv::getNumThreads());
+    Mat inpBlob2D = inpBlob.reshape(1, outCn);
+    Mat wgtBlob2D = wgtBlob.reshape(1, outCn*(inpCn/groups));
+    Mat outBlob2D = outBlobs[0].reshape(1, outBlobs[0].size[0]);
+    declare.in(inpBlob2D, wgtBlob2D, WARMUP_RNG).out(outBlob2D).tbb_threads(cv::getNumThreads());
 
     TEST_CYCLE_N(10)
     {
@@ -77,4 +89,4 @@ PERF_TEST_P( ConvolutionPerfTest, perf, Combine(
     }
 
     SANITY_CHECK_NOTHING();
-}
\ No newline at end of file
+}
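// --- Illustrative sketch, not part of the patch -----------------------------
// Running a single layer outside a Net, as the perf test above does: create it
// through the factory, then allocate() the output blobs and forward().
#include <opencv2/dnn.hpp>

static cv::Mat runReLU(cv::Mat& input)              // input: any CV_32F blob
{
    using namespace cv::dnn;
    LayerParams lp;                                 // ReLU needs no parameters here
    cv::Ptr<Layer> relu = LayerFactory::createLayerInstance("ReLU", lp);
    std::vector<cv::Mat*> inputs(1, &input);        // layers take pointers to inputs
    std::vector<cv::Mat> outputs;
    relu->allocate(inputs, outputs);                // shapes the output blobs
    relu->forward(inputs, outputs);                 // fills them
    return outputs[0];
}
// -----------------------------------------------------------------------------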
diff --git a/modules/dnn/samples/caffe_googlenet.cpp b/modules/dnn/samples/caffe_googlenet.cpp
index 0655a817f0f..73d5c2c6731 100644
--- a/modules/dnn/samples/caffe_googlenet.cpp
+++ b/modules/dnn/samples/caffe_googlenet.cpp
@@ -50,9 +50,9 @@ using namespace cv::dnn;
 using namespace std;
 
 /* Find best class for the blob (i. e. class with maximal probability) */
-void getMaxClass(dnn::Blob &probBlob, int *classId, double *classProb)
+void getMaxClass(const Mat &probBlob, int *classId, double *classProb)
 {
-    Mat probMat = probBlob.matRefConst().reshape(1, 1); //reshape the blob to 1x1000 matrix
+    Mat probMat = probBlob.reshape(1, 1); //reshape the blob to 1x1000 matrix
     Point classNumber;
 
     minMaxLoc(probMat, NULL, classProb, NULL, &classNumber);
@@ -115,8 +115,7 @@ int main(int argc, char **argv)
     }
 
     resize(img, img, Size(224, 224));       //GoogLeNet accepts only 224x224 RGB-images
-    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
-    dnn::Blob inputBlob = dnn::Blob::fromImages(img);   //Convert Mat to dnn::Blob batch of images
+    Mat inputBlob = blobFromImage(img);   //Convert Mat to batch of images
     //! [Prepare blob]
 
     //! [Set input blob]
@@ -128,7 +127,7 @@ int main(int argc, char **argv)
     //! [Make forward pass]
 
     //! [Gather output]
-    dnn::Blob prob = net.getBlob("prob");   //gather output of "prob" layer
+    Mat prob = net.getBlob("prob");   //gather output of "prob" layer
 
     int classId;
     double classProb;
diff --git a/modules/dnn/samples/fcn_semsegm.cpp b/modules/dnn/samples/fcn_semsegm.cpp
index bdeb75cd2f7..58d38d65321 100755
--- a/modules/dnn/samples/fcn_semsegm.cpp
+++ b/modules/dnn/samples/fcn_semsegm.cpp
@@ -1,7 +1,6 @@
 #include <opencv2/dnn.hpp>
 #include <opencv2/imgproc.hpp>
 #include <opencv2/highgui.hpp>
-#include <opencv2/core/ocl.hpp>
 
 using namespace cv;
 using namespace cv::dnn;
@@ -45,11 +44,11 @@ static vector<Vec3b> readColors(const string &filename = "pascal-classes.txt
     return colors;
 }
 
-static void colorizeSegmentation(dnn::Blob &score, const vector<Vec3b> &colors, cv::Mat &segm)
+static void colorizeSegmentation(const Mat &score, const vector<Vec3b> &colors, cv::Mat &segm)
 {
-    const int rows = score.rows();
-    const int cols = score.cols();
-    const int chns = score.channels();
+    const int rows = score.size[2];
+    const int cols = score.size[3];
+    const int chns = score.size[1];
 
     cv::Mat maxCl(rows, cols, CV_8UC1);
     cv::Mat maxVal(rows, cols, CV_32FC1);
@@ -57,7 +56,7 @@ static void colorizeSegmentation(dnn::Blob &score, const vector<Vec3b> &colo
     {
         for (int row = 0; row < rows; row++)
         {
-            const float *ptrScore = score.ptrf(0, ch, row);
+            const float *ptrScore = score.ptr<float>(0, ch, row);
             uchar *ptrMaxCl = maxCl.ptr<uchar>(row);
             float *ptrMaxVal = maxVal.ptr<float>(row);
             for (int col = 0; col < cols; col++)
@@ -87,7 +86,6 @@ static void colorizeSegmentation(dnn::Blob &score, const vector<Vec3b> &colo
 int main(int argc, char **argv)
 {
     cv::dnn::initModule();          //Required if OpenCV is built as static libs
-    cv::ocl::setUseOpenCL(false);   //OpenCL switcher
 
     String modelTxt = fcnType + "-heavy-pascal.prototxt";
     String modelBin = fcnType + "-heavy-pascal.caffemodel";
@@ -132,7 +130,7 @@ int main(int argc, char **argv)
     }
 
     resize(img, img, Size(500, 500));       //FCN accepts 500x500 RGB-images
-    dnn::Blob inputBlob = dnn::Blob::fromImages(img);   //Convert Mat to dnn::Blob batch of images
+    Mat inputBlob = blobFromImage(img);   //Convert Mat to batch of images
     //! [Prepare blob]
 
     //! [Set input blob]
@@ -147,13 +145,13 @@ int main(int argc, char **argv)
     //! [Make forward pass]
 
     //! [Gather output]
[Gather output] - dnn::Blob score = net.getBlob("score"); + Mat score = net.getBlob("score"); - cv::Mat colorize; + Mat colorize; colorizeSegmentation(score, colors, colorize); - cv::Mat show; - cv::addWeighted(img, 0.4, colorize, 0.6, 0.0, show); - cv::imshow("show", show); - cv::waitKey(0); + Mat show; + addWeighted(img, 0.4, colorize, 0.6, 0.0, show); + imshow("show", show); + waitKey(0); return 0; } //main diff --git a/modules/dnn/samples/ssd_object_detection.cpp b/modules/dnn/samples/ssd_object_detection.cpp index ec01d8f6cf6..4f9e6df732d 100644 --- a/modules/dnn/samples/ssd_object_detection.cpp +++ b/modules/dnn/samples/ssd_object_detection.cpp @@ -101,7 +101,7 @@ int main(int argc, char** argv) //! [Prepare blob] Mat preprocessedFrame = preprocess(frame); - dnn::Blob inputBlob = dnn::Blob::fromImages(preprocessedFrame); //Convert Mat to dnn::Blob image + Mat inputBlob = blobFromImage(preprocessedFrame); //Convert Mat to batch of images //! [Prepare blob] //! [Set input blob] @@ -113,8 +113,8 @@ int main(int argc, char** argv) //! [Make forward pass] //! [Gather output] - dnn::Blob detection = net.getBlob("detection_out"); - Mat detectionMat(detection.rows(), detection.cols(), CV_32F, detection.ptrf()); + Mat detection = net.getBlob("detection_out"); + Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr()); float confidenceThreshold = parser.get("min_confidence"); for(int i = 0; i < detectionMat.rows; i++) diff --git a/modules/dnn/samples/tf_inception.cpp b/modules/dnn/samples/tf_inception.cpp index e3b6e9cb38c..b6c03170771 100644 --- a/modules/dnn/samples/tf_inception.cpp +++ b/modules/dnn/samples/tf_inception.cpp @@ -32,7 +32,7 @@ const String keys = "{result r || path to save output blob (optional, binary format, NCHW order) }" ; -void getMaxClass(dnn::Blob &probBlob, int *classId, double *classProb); +void getMaxClass(const Mat &probBlob, int *classId, double *classProb); std::vector readClassNames(const char *filename); int main(int argc, char **argv) @@ -97,9 +97,7 @@ int main(int argc, char **argv) if (inputImgSize != img.size()) resize(img, img, inputImgSize); //Resize image to input size - cv::cvtColor(img, img, cv::COLOR_BGR2RGB); - - dnn::Blob inputBlob = dnn::Blob::fromImages(img); //Convert Mat to dnn::Blob image batch + Mat inputBlob = blobFromImage(img); //Convert Mat to image batch //! [Prepare blob] //! [Set input blob] @@ -116,11 +114,7 @@ int main(int argc, char **argv) tm.stop(); //! [Gather output] - dnn::Blob prob = net.getBlob(outBlobName); //gather output of "prob" layer - - Mat& result = prob.matRef(); - - BlobShape shape = prob.shape(); + Mat result = net.getBlob(outBlobName); //gather output of "prob" layer if (!resultFile.empty()) { CV_Assert(result.isContinuous()); @@ -130,7 +124,7 @@ int main(int argc, char **argv) fout.close(); } - std::cout << "Output blob shape " << shape << std::endl; + std::cout << "Output blob shape " << result.size[0] << " x " << result.size[1] << " x " << result.size[2] << " x " << result.size[3] << std::endl; std::cout << "Inference time, ms: " << tm.getTimeMilli() << std::endl; if (!classNamesFile.empty()) { @@ -138,7 +132,7 @@ int main(int argc, char **argv) int classId; double classProb; - getMaxClass(prob, &classId, &classProb);//find the best class + getMaxClass(result, &classId, &classProb);//find the best class //! 
[Print results] std::cout << "Best class: #" << classId << " '" << classNames.at(classId) << "'" << std::endl; @@ -149,9 +143,9 @@ int main(int argc, char **argv) /* Find best class for the blob (i. e. class with maximal probability) */ -void getMaxClass(dnn::Blob &probBlob, int *classId, double *classProb) +void getMaxClass(const Mat &probBlob, int *classId, double *classProb) { - Mat probMat = probBlob.matRefConst().reshape(1, 1); //reshape the blob to 1x1000 matrix + Mat probMat = probBlob.reshape(1, 1); //reshape the blob to 1x1000 matrix Point classNumber; minMaxLoc(probMat, NULL, classProb, NULL, &classNumber); diff --git a/modules/dnn/samples/torch_enet.cpp b/modules/dnn/samples/torch_enet.cpp index bf16eff7308..feb276cb43c 100644 --- a/modules/dnn/samples/torch_enet.cpp +++ b/modules/dnn/samples/torch_enet.cpp @@ -27,12 +27,12 @@ const String keys = ; std::vector readClassNames(const char *filename); -static void colorizeSegmentation(Blob &score, Mat &segm, +static void colorizeSegmentation(const Mat &score, Mat &segm, Mat &legend, vector &classNames); int main(int argc, char **argv) { - cv::CommandLineParser parser(argc, argv, keys); + CommandLineParser parser(argc, argv, keys); if (parser.has("help")) { @@ -78,31 +78,27 @@ int main(int argc, char **argv) //! [Initialize network] //! [Prepare blob] - Mat img = imread(imageFile), input; + Mat img = imread(imageFile, 1); + if (img.empty()) { std::cerr << "Can't read image from the file: " << imageFile << std::endl; exit(-1); } - cv::Size inputImgSize = cv::Size(512, 512); + Size inputImgSize(512, 512); if (inputImgSize != img.size()) resize(img, img, inputImgSize); //Resize image to input size - if(img.channels() == 3) - cv::cvtColor(img, input, cv::COLOR_BGR2RGB); - - input.convertTo(input, CV_32F, 1/255.0); - - dnn::Blob inputBlob = dnn::Blob::fromImages(input); //Convert Mat to dnn::Blob image batch + Mat inputBlob = blobFromImage(img, 1./255, true); //Convert Mat to image batch //! [Prepare blob] //! [Set input blob] net.setBlob("", inputBlob); //set the network input //! [Set input blob] - cv::TickMeter tm; + TickMeter tm; tm.start(); //! 
[Make forward pass] @@ -119,11 +115,7 @@ int main(int argc, char **argv) oBlob = parser.get("o_blob"); } - dnn::Blob prob = net.getBlob(oBlob); //gather output of "prob" layer - - Mat& result = prob.matRef(); - - BlobShape shape = prob.shape(); + Mat result = net.getBlob(oBlob); //gather output of "prob" layer if (!resultFile.empty()) { CV_Assert(result.isContinuous()); @@ -133,20 +125,21 @@ int main(int argc, char **argv) fout.close(); } - std::cout << "Output blob shape " << shape << std::endl; + std::cout << "Output blob: " << result.size[0] << " x " << result.size[1] << " x " << result.size[2] << " x " << result.size[3] << "\n"; std::cout << "Inference time, ms: " << tm.getTimeMilli() << std::endl; if (parser.has("show")) { + size_t nclasses = result.size[1]; std::vector classNames; if(!classNamesFile.empty()) { classNames = readClassNames(classNamesFile.c_str()); - if (classNames.size() > prob.channels()) - classNames = std::vector(classNames.begin() + classNames.size() - prob.channels(), + if (classNames.size() > nclasses) + classNames = std::vector(classNames.begin() + classNames.size() - nclasses, classNames.end()); } Mat segm, legend; - colorizeSegmentation(prob, segm, legend, classNames); + colorizeSegmentation(result, segm, legend, classNames); Mat show; addWeighted(img, 0.2, segm, 0.8, 0.0, show); @@ -184,11 +177,11 @@ std::vector readClassNames(const char *filename) return classNames; } -static void colorizeSegmentation(Blob &score, Mat &segm, Mat &legend, vector &classNames) +static void colorizeSegmentation(const Mat &score, Mat &segm, Mat &legend, vector &classNames) { - const int rows = score.rows(); - const int cols = score.cols(); - const int chns = score.channels(); + const int rows = score.size[2]; + const int cols = score.size[3]; + const int chns = score.size[1]; vector colors; RNG rng(12345678); @@ -200,7 +193,7 @@ static void colorizeSegmentation(Blob &score, Mat &segm, Mat &legend, vector(0, ch, row); uchar *ptrMaxCl = maxCl.ptr(row); float *ptrMaxVal = maxVal.ptr(row); for (int col = 0; col < cols; col++) diff --git a/modules/dnn/src/blob.cpp b/modules/dnn/src/blob.cpp deleted file mode 100644 index 9dc0d970407..00000000000 --- a/modules/dnn/src/blob.cpp +++ /dev/null @@ -1,421 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. 
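For reference, the Mat-only sample flow that replaces the old Blob pipeline looks roughly like this (a minimal sketch assuming a loaded net, a 3-channel BGR image img, and the output name "score" used in the samples above; NCHW indexing goes through Mat::size and Mat::ptr):

// Sketch: build the input blob, run the net, and walk the 4D NCHW output.
Mat inputBlob = blobFromImage(img, 1.0, false);  // 1 x C x H x W, CV_32F
net.setBlob("", inputBlob);                      // bind the network input
net.forward();                                   // run inference
Mat score = net.getBlob("score");                // 4D output blob
const int chns = score.size[1], rows = score.size[2], cols = score.size[3];
for (int ch = 0; ch < chns; ch++)
    for (int row = 0; row < rows; row++)
    {
        const float *p = score.ptr<float>(0, ch, row); // one row of one plane
        float s = 0;
        for (int col = 0; col < cols; col++)
            s += p[col];
    }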
-// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#include "precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -Blob::Blob() -{ - CV_DNN_UMAT_ONLY(state = UNINITIALIZED); -} - -Blob::Blob(const BlobShape &shape, int type, int allocFlags) -{ - CV_DNN_UMAT_ONLY(state = UNINITIALIZED); - this->create(shape, type, allocFlags); -} - -Blob::Blob(InputArray data) -{ -#ifndef CV_DNN_UMAT - m = data.getMat(); -#else - if (data.isUMat()) - { - um = data.getUMat(); - state = HEAD_AT_UMAT; - } - else - { - m = data.getMat(); - state = HEAD_AT_MAT; - } -#endif -} - -void Blob::create(const BlobShape &shape, int type, int allocFlags) -{ -#ifndef CV_DNN_UMAT - CV_Assert(allocFlags & ALLOC_MAT); - m.create(shape.dims(), shape.ptr(), type); -#else - CV_Assert(allocFlags & ALLOC_MAT || allocFlags & ALLOC_UMAT); - - if (allocFlags & ALLOC_MAT) - m.create(shape.dims(), shape.ptr(), type); - if (allocFlags & ALLOC_UMAT) - um.create(shape.dims(), shape.ptr(), type); - - if (state == UNINITIALIZED) - { - if (allocFlags & ALLOC_MAT && allocFlags & ALLOC_UMAT) - state = SYNCED; - else if (allocFlags & ALLOC_MAT) - state = HEAD_AT_MAT; - else - state = HEAD_AT_UMAT; - } -#endif -} - -void Blob::fill(InputArray in) -{ -#ifdef CV_DNN_UMAT - CV_Assert(in.isMat() || in.isUMat()); - if (in.isMat()) - { - m = in.getMat(); - state = HEAD_AT_MAT; - } - else - { - um = in.getUMat(); - state = HEAD_AT_UMAT; - } -#else - CV_Assert(in.isMat()); - m = in.getMat(); -#endif -} - -static inline int getMatChannels(const Mat &mat) -{ - return (mat.dims <= 2) ? mat.channels() : mat.size[0]; -} - -static BlobShape getBlobShape(std::vector &vmat, int requestedCn = -1) -{ - BlobShape shape(BlobShape::all(4)); - int cnSum = 0, matCn; - - CV_Assert(vmat.size() > 0); - - for (size_t i = 0; i < vmat.size(); i++) - { - Mat &mat = vmat[i]; - CV_Assert(!mat.empty()); - CV_Assert((mat.dims == 3 && mat.channels() == 1) || mat.dims <= 2); - - matCn = getMatChannels(mat); - cnSum += getMatChannels(mat); - - if (i == 0) - { - shape[-1] = mat.cols; - shape[-2] = mat.rows; - shape[-3] = (requestedCn <= 0) ? 
matCn : requestedCn; - } - else - { - if (mat.cols != shape[-1] || mat.rows != shape[-2]) - CV_Error(Error::StsError, "Each Mat.size() must be equal"); - - if (requestedCn <= 0 && matCn != shape[-3]) - CV_Error(Error::StsError, "Each Mat.chnannels() (or number of planes) must be equal"); - } - } - - if (cnSum % shape[-3] != 0) - CV_Error(Error::StsError, "Total number of channels in vector is not a multiple of requsted channel number"); - - shape[0] = cnSum / shape[-3]; - return shape; -} - -static std::vector extractMatVector(InputArray in) -{ - if (in.isMat() || in.isUMat()) - { - return std::vector(1, in.getMat()); - } - else if (in.isMatVector()) - { - return *static_cast*>(in.getObj()); - } - else if (in.isUMatVector()) - { - std::vector vmat; - in.getMatVector(vmat); - return vmat; - } - else - { - CV_Assert(in.isMat() || in.isMatVector() || in.isUMat() || in.isUMatVector()); - return std::vector(); - } -} - -void Blob::batchFromImages(InputArray image, int dstCn) -{ - CV_Assert(dstCn == -1 || dstCn > 0); - std::vector inMats = extractMatVector(image); - BlobShape dstShape = getBlobShape(inMats, dstCn); - - int dtype = CV_32F; - this->create(dstShape, dtype, ALLOC_MAT); - uchar *dstPtr = this->matRef().ptr(); - int elemSize = CV_ELEM_SIZE(dtype); - - std::vector wrapBuf(dstShape[-3]); - for (size_t i = 0; i < inMats.size(); i++) - { - Mat inMat = inMats[i]; - - if (inMat.dims <= 2) - { - inMat.convertTo(inMat, dtype); - - wrapBuf.resize(0); - for (int cn = 0; cn < inMat.channels(); cn++) - { - wrapBuf.push_back(Mat(inMat.rows, inMat.cols, dtype, dstPtr)); - dstPtr += elemSize * inMat.total(); - } - - cv::split(inMat, wrapBuf); - } - else - { - inMat.convertTo(Mat(inMat.dims, inMat.size, dtype, dstPtr), dtype); - dstPtr += elemSize * inMat.total(); - } - } -} - -Blob Blob::fromImages(InputArray image, int dstCn) -{ - Blob res; - res.batchFromImages(image, dstCn); - return res; -} - -void Blob::fill(const BlobShape &shape, int type, void *data, bool deepCopy) -{ - if (deepCopy) - { - create(shape, type); - memcpy(ptr(), data, this->total() * CV_ELEM_SIZE(type)); - } - else - { - m = Mat(shape.dims(), shape.ptr(), type, data); - } - CV_DNN_UMAT_ONLY(state = HEAD_AT_MAT); -} - -void Blob::setTo(InputArray value, int allocFlags) -{ -#ifdef CV_DNN_UMAT - if (allocFlags == -1) - { - if (state == HEAD_AT_UMAT) - um.setTo(value); - else if (state == HEAD_AT_MAT) - m.setTo(value); - else //SYNCED or UNINITIALIZED - { - um.setTo(value); - m.setTo(value); - - if (state == UNINITIALIZED) - state = SYNCED; - } - } - else if (allocFlags == ALLOC_BOTH) - { - m.setTo(value); - um.setTo(value); - state = SYNCED; - } - else if (allocFlags == ALLOC_MAT) - { - matRef().setTo(value); - } - else if (allocFlags == ALLOC_UMAT) - { - umatRef().setTo(value); - } - else - { - CV_Error(Error::StsBadArg, "allocFlags sholud be -1 or one of Blob::AllocFlag values"); - } -#else - m.setTo(value); -#endif -} - -void Blob::updateMat(bool syncData) const -{ -#ifdef CV_DNN_UMAT - if (state == UNINITIALIZED || state == SYNCED || state == HEAD_AT_MAT) - { - return; - } - else if (state == HEAD_AT_UMAT) - { - if (syncData) - um.copyTo(m); - else - m.create(dims(), sizes(), type()); - state = SYNCED; - } - else - { - CV_Error(Error::StsInternal, ""); - } -#else - (void)syncData; -#endif -} - -void Blob::updateUMat(bool syncData) const -{ -#ifdef CV_DNN_UMAT - if (state == UNINITIALIZED || state == SYNCED || state == HEAD_AT_UMAT) - { - return; - } - else if (state == HEAD_AT_MAT) - { - if (syncData) - m.copyTo(um); - else - 
um.create(dims(), sizes(), type()); - } - else - { - CV_Error(Error::StsInternal, ""); - } -#else - (void)syncData; -#endif -} - -void Blob::sync() const -{ - updateMat(); - updateUMat(); -} - -Vec4i Blob::shape4() const -{ - return Vec4i(num(), channels(), rows(), cols()); -} - -//BlobShape - -std::ostream &operator<< (std::ostream &stream, const BlobShape &shape) -{ - stream << "["; - - for (int i = 0; i < shape.dims() - 1; i++) - stream << shape[i] << ", "; - if (shape.dims() > 0) - stream << shape[-1]; - - return stream << "]"; -} - -BlobShape computeShapeByReshapeMask(const BlobShape &srcShape, const BlobShape &maskShape, Range srcRange /*= Range::all()*/) -{ - if (srcRange == Range::all()) - srcRange = Range(0, srcShape.dims()); - else - { - int sz = srcRange.size(); - srcRange.start = srcShape.canonicalAxis(srcRange.start); - srcRange.end = (srcRange.end == INT_MAX) ? srcShape.dims() : srcRange.start + sz; - } - - CV_Assert(0 <= srcRange.start && srcRange.start <= srcRange.end && srcRange.end <= srcShape.dims()); - BlobShape dstShape(srcShape.dims() - srcRange.size() + maskShape.dims(), (const int*)NULL); - - std::copy(srcShape.ptr(), srcShape.ptr() + srcRange.start, dstShape.ptr()); - std::copy(srcShape.ptr() + srcRange.end, srcShape.ptr() + srcShape.dims(), dstShape.ptr() + srcRange.start + maskShape.dims()); - - int inferDim = -1; - for (int i = 0; i < maskShape.dims(); i++) - { - if (maskShape[i] > 0) - { - dstShape[srcRange.start + i] = maskShape[i]; - } - else if (maskShape[i] == 0) - { - if (srcRange.start + i >= srcShape.dims()) - CV_Error(Error::StsBadArg, format("Copy dim[%d] (which has zero size) is out of the source shape bounds", srcRange.start + i)); - dstShape[srcRange.start + i] = srcShape[srcRange.start + i]; - } - else if (maskShape[i] == -1) - { - if (inferDim != -1) - CV_Error(Error::StsAssert, "Duplicate of inferred dim (which is denoted by -1)"); - inferDim = srcRange.start + i; - dstShape[inferDim] = 1; - } - else - CV_Error(Error::StsBadArg, "maskShape[i] >= -1"); - } - - if (inferDim != -1) - { - ptrdiff_t srcTotal = srcShape.total(); - ptrdiff_t dstTotal = dstShape.total(); - if (srcTotal % dstTotal != 0) - CV_Error(Error::StsBackTrace, "Can't infer a dim denoted by -1"); - - dstShape[inferDim] = (int)(srcTotal / dstTotal); - } - else - { - CV_Assert(srcShape.total() == dstShape.total()); - } - - return dstShape; -} - -} -} diff --git a/modules/dnn/src/caffe/caffe_importer.cpp b/modules/dnn/src/caffe/caffe_importer.cpp index 312671e3eab..c41f0925730 100644 --- a/modules/dnn/src/caffe/caffe_importer.cpp +++ b/modules/dnn/src/caffe/caffe_importer.cpp @@ -192,38 +192,37 @@ class CaffeImporter : public Importer } } - BlobShape blobShapeFromProto(const caffe::BlobProto &pbBlob) + void blobShapeFromProto(const caffe::BlobProto &pbBlob, std::vector& shape) { + shape.clear(); if (pbBlob.has_num() || pbBlob.has_channels() || pbBlob.has_height() || pbBlob.has_width()) { - return BlobShape(pbBlob.num(), pbBlob.channels(), pbBlob.height(), pbBlob.width()); + shape.push_back(pbBlob.num()); + shape.push_back(pbBlob.channels()); + shape.push_back(pbBlob.height()); + shape.push_back(pbBlob.width()); } else if (pbBlob.has_shape()) { const caffe::BlobShape &_shape = pbBlob.shape(); - BlobShape shape = BlobShape::all(_shape.dim_size()); for (int i = 0; i < _shape.dim_size(); i++) - shape[i] = (int)_shape.dim(i); - - return shape; + shape.push_back((int)_shape.dim(i)); } else - { CV_Error(Error::StsError, "Unknown shape of input blob"); - return BlobShape(); - } } - void 
blobFromProto(const caffe::BlobProto &pbBlob, cv::dnn::Blob &dstBlob) + void blobFromProto(const caffe::BlobProto &pbBlob, cv::Mat &dstBlob) { - BlobShape shape = blobShapeFromProto(pbBlob); + std::vector shape; + blobShapeFromProto(pbBlob, shape); - dstBlob.create(shape, CV_32F); - CV_Assert(pbBlob.data_size() == (int)dstBlob.matRefConst().total()); + dstBlob.create((int)shape.size(), &shape[0], CV_32F); + CV_Assert(pbBlob.data_size() == (int)dstBlob.total()); CV_DbgAssert(pbBlob.GetDescriptor()->FindFieldByLowercaseName("data")->cpp_type() == FieldDescriptor::CPPTYPE_FLOAT); - float *dstData = dstBlob.matRef().ptr(); + float *dstData = dstBlob.ptr(); for (int i = 0; i < pbBlob.data_size(); i++) dstData[i] = pbBlob.data(i); diff --git a/modules/dnn/src/caffe/layer_loaders.cpp b/modules/dnn/src/caffe/layer_loaders.cpp deleted file mode 100644 index 571d27a5bda..00000000000 --- a/modules/dnn/src/caffe/layer_loaders.cpp +++ /dev/null @@ -1,383 +0,0 @@ -#include "../precomp.hpp" -#include "layer_loaders.hpp" -#include -#include -#include "layers/layers_common.hpp" - -namespace cv -{ -namespace dnn -{ - -//Layers - -//Convolution and Deconvolution -static void initConvDeconvLayerFromCaffe(Ptr l, LayerParams ¶ms) -{ - l->setParamsFrom(params); - getConvolutionKernelParams(params, l->kernel.height, l->kernel.width, l->pad.height, - l->pad.width, l->stride.height, l->stride.width, l->dilation.height, - l->dilation.width, l->padMode); - - bool bias = params.get("bias_term", true); - int numOutput = params.get("num_output"); - int group = params.get("group", 1); - - l->adjustPad.height = params.get("adj_h", 0); - l->adjustPad.width = params.get("adj_w", 0); - - CV_Assert(numOutput % group == 0); - CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1)); -} - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - Ptr l = ConvolutionLayer::create(); - initConvDeconvLayerFromCaffe(l, params); - return Ptr(l); -} - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - Ptr l = DeconvolutionLayer::create(); - initConvDeconvLayerFromCaffe(l, params); - - return Ptr(l); -} - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - int type = PoolingLayer::MAX; - Size kernel, stride, pad; - bool globalPooling; - cv::String padMode; - - if (params.has("pool")) - { - String pool = params.get("pool").toLowerCase(); - if (pool == "max") - type = PoolingLayer::MAX; - else if (pool == "ave") - type = PoolingLayer::AVE; - else if (pool == "stochastic") - type = PoolingLayer::STOCHASTIC; - else - CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\""); - } - - getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling, - pad.height, pad.width, stride.height, stride.width, padMode); - //getCaffeConvParams(params, kernel, pad, stride); - - Ptr l; - if (!globalPooling) - l = PoolingLayer::create(type, kernel, stride, pad, padMode); - else - l = PoolingLayer::createGlobal(type); - l->setParamsFrom(params); - return l; -} - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - int axis = params.get("axis", 1); - Ptr l(SoftmaxLayer::create(axis)); - l->setParamsFrom(params); - return l; -} - -template<> //InnerProduct specialization -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - const std::vector &blobs = params.blobs; - CV_Assert(1 <= blobs.size() && blobs.size() <= 2); - - int numOutputs = params.get("num_output"); - int innerSize = (int)blobs[0].total() / numOutputs; - bool bias = params.get("bias_term", true); - int axis = params.get("axis", 1); - - 
CV_Assert(blobs[0].dims() >= 2 && (size_t)(innerSize * numOutputs) == blobs[0].total()); - CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutputs == blobs[1].total())); - - Ptr l = InnerProductLayer::create(axis); - l->setParamsFrom(params); - l->blobs[0].reshape(Shape(numOutputs, innerSize)); - if (bias) - l->blobs[1].reshape(Shape(1, numOutputs)); - - return Ptr(l); -} - -template<> //LRNLayer specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - int type = -1; - String nrmType = params.get("norm_region", "ACROSS_CHANNELS"); - if (nrmType == "ACROSS_CHANNELS") - type = LRNLayer::CHANNEL_NRM; - else if (nrmType == "WITHIN_CHANNEL") - type = LRNLayer::SPATIAL_NRM; - else - CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\""); - - int size = params.get("local_size", 5); - if (size % 2 != 1 || size <= 0) - CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size"); - - double alpha = params.get("alpha", 1); - double beta = params.get("beta", 0.75); - double bias = params.get("bias", 1); - bool normBySize = params.get("norm_by_size", true); - - Ptr l(LRNLayer::create(type, size, alpha, beta, bias, normBySize)); - l->setParamsFrom(params); - return l; -} - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - Ptr l(MVNLayer::create( - params.get("normalize_variance", true), - params.get("across_channels", false), - params.get("eps", 1e-9) - )); - l->setParamsFrom(params); - return l; -} - -/* Reshape layers */ - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - int axis = params.get("axis", 0); - int numAxes = params.get("num_axes", -1); - bool enableReordering = params.get("reorder_dims", false); - CV_Assert(numAxes >= -1); - Range applyingRange = (numAxes == -1) ? Range(axis, INT_MAX) : Range(axis, axis + numAxes); - - Shape newShape; - if (params.has("dim")) - { - const DictValue ¶mShape = params.get("dim"); - newShape = Shape::all(paramShape.size()); - for (int i = 0; i < paramShape.size(); i++) - newShape[i] = paramShape.get(i); - } - else - newShape = Shape::all(0); - - Ptr l(ReshapeLayer::create(newShape, applyingRange, enableReordering)); - l->setParamsFrom(params); - return l; -} - -template<> -Ptr createLayerFromCaffe(LayerParams& params) -{ - Ptr l(ConcatLayer::create(params.get("axis", 1))); - l->setParamsFrom(params); - return l; -} - -template<> -Ptr createLayerFromCaffe(LayerParams ¶ms) -{ - int outputsCount; - - //TODO: maybe "top_count" param is useless because it can be determined by output connections number - if (params.has("top_count")) - { - outputsCount = params.get("top_count"); - CV_Assert(outputsCount >= 0); - } - else - { - outputsCount = -1; - } - - Ptr l(SplitLayer::create(outputsCount)); - l->setParamsFrom(params); - return l; -} - -template<> -Ptr createLayerFromCaffe(LayerParams& params) -{ - int axis = params.get("axis", 1); - - Ptr l; - if (!params.has("slice_point")) - { - l = SliceLayer::create(axis); - } - else - { - const DictValue &indicesValue = params.get("slice_point"); - std::vector sliceIndices(indicesValue.size()); - for (int i = 0; i < indicesValue.size(); i++) - sliceIndices[i] = indicesValue.get(i); - - l = SliceLayer::create(axis, sliceIndices); - } - l->setParamsFrom(params); - return l; -} - -/* Activation layers */ - -template //Intended for parameters-free activations -Ptr createLayerFromCaffe(LayerParams&) -{ - return Ptr(ActivationLayer::create()); -} - -template<> //ReLU specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - float negative_slope = 
params.get("negative_slope", 0.f); - Ptr l(ReLULayer::create(negative_slope)); - l->setParamsFrom(params); - return l; -} - -template<> //Power specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - float power = params.get("power", 1.0f); - float scale = params.get("scale", 1.0f); - float shift = params.get("shift", 0.0f); - Ptr l(PowerLayer::create(power, scale, shift)); - l->setParamsFrom(params); - return l; -} - -template<> //CropLayer specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - int start_axis = params.get("axis", 2); - DictValue *paramOffset = params.ptr("offset"); - - std::vector offset; - if (paramOffset) - { - for (int i = 0; i < paramOffset->size(); i++) - offset.push_back(paramOffset->get(i)); - } - - Ptr l(CropLayer::create(start_axis, offset)); - l->setParamsFrom(params); - return l; -} - -template<> //Eltwise specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - EltwiseLayer::EltwiseOp op = EltwiseLayer::SUM; - if (params.has("operation")) - { - String operation = params.get("operation").toLowerCase(); - if (operation == "prod") - op = EltwiseLayer::PROD; - else if (operation == "sum") - op = EltwiseLayer::SUM; - else if (operation == "max") - op = EltwiseLayer::MAX; - else - CV_Error(cv::Error::StsBadArg, "Unknown operaticon type \"" + operation + "\""); - } - - std::vector coeffs; - if (params.has("coeff")) - { - DictValue paramCoeff = params.get("coeff"); - coeffs.resize(paramCoeff.size(), 1); - for (int i = 0; i < paramCoeff.size(); i++) - { - coeffs[i] = paramCoeff.get(i); - } - } - Ptr l(EltwiseLayer::create(op, coeffs)); - l->setParamsFrom(params); - return l; -} - -template<> //BatchNormLayer specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - const std::vector &blobs = params.blobs; - CV_Assert(blobs.size() >= 3); - - bool hasWeights = params.get("has_weight", false); - bool hasBias = params.get("has_bias", false); - float epsilon = params.get("eps", 1E-5); - Ptr l = BatchNormLayer::create(hasWeights, hasBias, epsilon); - l->setParamsFrom(params); - - return Ptr(l); -} - -template<> //ChannelsPReLULayer specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - CV_Assert(params.blobs.size() == 1); - Ptr l = ChannelsPReLULayer::create(); - l->setParamsFrom(params); - - return Ptr(l); -} - -template<> //MaxUnpoolLayer specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - Size poolKernel(params.get("pool_k_w"), params.get("pool_k_h")), - poolPad(params.get("pool_pad_w"), params.get("pool_pad_h")), - poolStride(params.get("pool_stride_w"), params.get("pool_stride_h")); - Ptr l = MaxUnpoolLayer::create(poolKernel, poolPad, poolStride); - l->setParamsFrom(params); - - return Ptr(l); -} - -template<> //ScaleLayer specialization -Ptr createLayerFromCaffe(LayerParams& params) -{ - Ptr l = ScaleLayer::create(params.get("bias_term", false)); - l->setParamsFrom(params); - - return Ptr(l); -} - -//Explicit instantiation -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); - -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); - -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr 
createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); - -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -template Ptr createLayerFromCaffe(LayerParams&); -} -} diff --git a/modules/dnn/src/caffe/layer_loaders.hpp b/modules/dnn/src/caffe/layer_loaders.hpp deleted file mode 100644 index 617691cb467..00000000000 --- a/modules/dnn/src/caffe/layer_loaders.hpp +++ /dev/null @@ -1,60 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
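The deleted loaders above all follow one pattern that the layer implementations now absorb: create() receives the LayerParams directly and the constructor parses them. A schematic of what each registered class now does (the "scale" parameter is hypothetical, for illustration only):

// Sketch of the LayerParams-driven pattern that replaces layer_loaders.cpp.
class MyLayerImpl : public Layer
{
public:
    MyLayerImpl(const LayerParams &params)
    {
        setParamsFrom(params);                    // copy name, type and blobs
        scale = params.get<float>("scale", 1.f);  // hypothetical parameter
    }
    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs) { /* ... */ }
    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs) { /* ... */ }
    float scale;
};
// static Ptr<MyLayer> MyLayer::create(const LayerParams &params)
// { return Ptr<MyLayer>(new MyLayerImpl(params)); }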
-// -//M*/ - -#ifndef __OPENCV_DNN_CAFFE_LAYER_LOADERS_HPP__ -#define __OPENCV_DNN_CAFFE_LAYER_LOADERS_HPP__ - -#include - -namespace cv -{ -namespace dnn -{ - -//Common template for Caffe layer loaders -template -Ptr createLayerFromCaffe(LayerParams&); - -Ptr createFlattenLayerFromCaffe(LayerParams&); - -} -} -#endif \ No newline at end of file diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp index 26d21a31128..4be9138f936 100644 --- a/modules/dnn/src/dnn.cpp +++ b/modules/dnn/src/dnn.cpp @@ -67,6 +67,65 @@ static String toString(const T &v) return ss.str(); } +Mat blobFromImage(const Mat& image_, double scalefactor, bool swapRB) +{ + Mat image; + if(image_.depth() == CV_8U) + { + image_.convertTo(image, CV_32F, scalefactor); + } + else + image = image_; + CV_Assert(image.dims == 2 && image.depth() == CV_32F); + int nch = image.channels(); + CV_Assert(nch == 3 || nch == 4); + int sz[] = { 1, 3, image.rows, image.cols }; + Mat blob(4, sz, CV_32F); + Mat ch[4]; + for( int j = 0; j < 3; j++ ) + ch[j] = Mat(image.rows, image.cols, CV_32F, blob.ptr(0, j)); + if(swapRB) + std::swap(ch[0], ch[2]); + split(image, ch); + return blob; +} + +Mat blobFromImages(const std::vector& images, double scalefactor, bool swapRB) +{ + size_t i, nimages = images.size(); + if(nimages == 0) + return Mat(); + Mat image0 = images[0]; + int nch = image0.channels(); + CV_Assert(image0.dims == 2 && (nch == 3 || nch == 4)); + int sz[] = { (int)nimages, 3, image0.rows, image0.cols }; + Mat blob(4, sz, CV_32F), image; + Mat ch[4]; + + for( i = 0; i < nimages; i++ ) + { + Mat image_ = images[i]; + if(image_.depth() == CV_8U) + { + image_.convertTo(image, CV_32F, scalefactor); + } + else + image = image_; + CV_Assert(image.depth() == CV_32F); + nch = image.channels(); + CV_Assert(image.dims == 2 && (nch == 3 || nch == 4)); + CV_Assert(image.size() == image0.size()); + + for( int j = 0; j < 3; j++ ) + ch[j] = Mat(image.rows, image.cols, CV_32F, blob.ptr((int)i, j)); + if(swapRB) + std::swap(ch[0], ch[2]); + split(image, ch); + } + return blob; +} + + struct LayerPin { int lid; @@ -107,8 +166,8 @@ struct LayerData std::set requiredOutputs; Ptr layerInstance; - std::vector outputBlobs; - std::vector inputBlobs; + std::vector outputBlobs; + std::vector inputBlobs; int flag; @@ -130,8 +189,8 @@ struct LayerData //fake layer containing network input blobs struct DataLayer : public Layer { - void allocate(const std::vector&, std::vector&) {} - void forward(std::vector&, std::vector&) {} + void allocate(const std::vector&, std::vector&) {} + void forward(std::vector&, std::vector&) {} int outputNameToIndex(String tgtName) { @@ -348,8 +407,27 @@ struct Net::Impl if (ld.flag) return; + size_t ninputs = ld.inputBlobsId.size(); +#if 0 + printf("layer %s:", ld.name.c_str()); + for (size_t i = 0; i < ninputs; i++) + { + int inp_lid = ld.inputBlobsId[i].lid; + LayerData &inp_ld = layers[inp_lid]; + int inp_outputs = (int)inp_ld.outputBlobs.size(); + std::cout << " " << inp_ld.name << "(" << inp_outputs; + + for( int j = 0; j < inp_outputs; j++ ) + { + std::cout << (j == 0 ? 
": " : ", ") << inp_ld.outputBlobs[j].size; + } + std::cout << ")"; + } + printf("\n"); +#endif + //determine parent layers - for (size_t i = 0; i < ld.inputBlobsId.size(); i++) + for (size_t i = 0; i < ninputs; i++) ld.inputLayersId.insert(ld.inputBlobsId[i].lid); //allocate parents @@ -357,8 +435,8 @@ struct Net::Impl allocateLayer(*i); //bind inputs - ld.inputBlobs.resize(ld.inputBlobsId.size()); - for (size_t i = 0; i < ld.inputBlobsId.size(); i++) + ld.inputBlobs.resize(ninputs); + for (size_t i = 0; i < ninputs; i++) { LayerPin from = ld.inputBlobsId[i]; CV_Assert(from.valid()); @@ -368,15 +446,24 @@ struct Net::Impl //allocate layer ld.outputBlobs.resize(std::max((size_t)1, ld.requiredOutputs.size())); //layer produce at least one output blob - try + //try { Ptr layerPtr = ld.getLayerInstance(); layerPtr->allocate(ld.inputBlobs, ld.outputBlobs); +#if 0 + std::cout << "\toutputs:"; + size_t noutputs = ld.outputBlobs.size(); + for (size_t j = 0; j < noutputs; j++) + { + std::cout << (j == 0 ? " " : ", ") << ld.outputBlobs[j].size; + } + std::cout << "\n"; +#endif } - catch (const cv::Exception &err) + /*catch (const cv::Exception &err) { CV_RETHROW_ERROR(err, format("The following error occured while making allocate() for layer \"%s\": %s", ld.name.c_str(), err.err.c_str())); - } + }*/ ld.flag = 1; } @@ -414,14 +501,14 @@ struct Net::Impl } //forward itself - try + //try { ld.layerInstance->forward(ld.inputBlobs, ld.outputBlobs); } - catch (const cv::Exception &err) + /*catch (const cv::Exception &err) { CV_RETHROW_ERROR(err, format("The following error occured while making forward() for layer \"%s\": %s", ld.name.c_str(), err.err.c_str())); - } + }*/ ld.flag = 1; } @@ -509,7 +596,7 @@ void Net::setNetInputs(const std::vector &inputBlobNames) impl->netInputLayer->setNames(inputBlobNames); } -void Net::setBlob(String outputName, const Blob &blob) +void Net::setBlob(String outputName, const Mat &blob_) { LayerPin pin = impl->getPinByAlias(outputName); if (!pin.valid()) @@ -517,10 +604,10 @@ void Net::setBlob(String outputName, const Blob &blob) LayerData &ld = impl->layers[pin.lid]; ld.outputBlobs.resize( std::max(pin.oid+1, (int)ld.requiredOutputs.size()) ); - ld.outputBlobs[pin.oid] = blob; + ld.outputBlobs[pin.oid] = blob_.clone(); } -Blob Net::getBlob(String outputName) +Mat Net::getBlob(String outputName) { LayerPin pin = impl->getPinByAlias(outputName); if (!pin.valid()) @@ -535,20 +622,20 @@ Blob Net::getBlob(String outputName) return ld.outputBlobs[pin.oid]; } -Blob Net::getParam(LayerId layer, int numParam) +Mat Net::getParam(LayerId layer, int numParam) { LayerData &ld = impl->getLayerData(layer); - std::vector &layerBlobs = ld.layerInstance->blobs; + std::vector &layerBlobs = ld.layerInstance->blobs; CV_Assert(numParam < (int)layerBlobs.size()); return layerBlobs[numParam]; } -void Net::setParam(LayerId layer, int numParam, const Blob &blob) +void Net::setParam(LayerId layer, int numParam, const Mat &blob) { LayerData &ld = impl->getLayerData(layer); - std::vector &layerBlobs = ld.layerInstance->blobs; + std::vector &layerBlobs = ld.layerInstance->blobs; CV_Assert(numParam < (int)layerBlobs.size()); //we don't make strong checks, use this function carefully layerBlobs[numParam] = blob; @@ -662,30 +749,30 @@ static void vecToPVec(const std::vector &v, std::vector &pv) pv[i] = const_cast(&v[i]); } -void Layer::allocate(const std::vector &inputs, std::vector &outputs) +void Layer::allocate(const std::vector &inputs, std::vector &outputs) { - std::vector inputsp; + std::vector 
inputsp; vecToPVec(inputs, inputsp); this->allocate(inputsp, outputs); } -std::vector Layer::allocate(const std::vector &inputs) +std::vector Layer::allocate(const std::vector &inputs) { - std::vector outputs; + std::vector outputs; this->allocate(inputs, outputs); return outputs; } -void Layer::forward(const std::vector &inputs, std::vector &outputs) +void Layer::forward(const std::vector &inputs, std::vector &outputs) { - std::vector inputsp; + std::vector inputsp; vecToPVec(inputs, inputsp); this->forward(inputsp, outputs); } -void Layer::run(const std::vector &inputs, std::vector &outputs) +void Layer::run(const std::vector &inputs, std::vector &outputs) { - std::vector inputsp; + std::vector inputsp; vecToPVec(inputs, inputsp); this->allocate(inputsp, outputs); this->forward(inputsp, outputs); diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 8d92d61aa34..e9b03464e76 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -40,19 +40,6 @@ //M*/ #include "precomp.hpp" -#include "caffe/layer_loaders.hpp" -#include "layers/blank_layer.hpp" - -#include "layers/crop_layer.hpp" -#include "layers/eltwise_layer.hpp" -#include "layers/flatten_layer.hpp" -#include "layers/permute_layer.hpp" -#include "layers/prior_box_layer.hpp" -#include "layers/detection_output_layer.hpp" -#include "layers/normalize_bbox_layer.hpp" -#include "layers/shift_layer.hpp" -#include "layers/padding_layer.hpp" -#include "layers/scale_layer.hpp" namespace cv { @@ -65,7 +52,7 @@ struct AutoInitializer AutoInitializer() : status(false) { - cv::dnn::initModule(); + initModule(); } }; @@ -76,41 +63,41 @@ void initModule() if (init.status) return; - REG_RUNTIME_LAYER_FUNC(Slice, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Split, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Concat, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Reshape, createLayerFromCaffe); + REG_RUNTIME_LAYER_CLASS(Slice, SliceLayer); + REG_RUNTIME_LAYER_CLASS(Split, SplitLayer); + REG_RUNTIME_LAYER_CLASS(Concat, ConcatLayer); + REG_RUNTIME_LAYER_CLASS(Reshape, ReshapeLayer); REG_RUNTIME_LAYER_CLASS(Flatten, FlattenLayer); - REG_RUNTIME_LAYER_FUNC(Convolution, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Deconvolution, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Pooling, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(LRN, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(InnerProduct, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Softmax, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(MVN, createLayerFromCaffe); + REG_RUNTIME_LAYER_CLASS(Convolution, ConvolutionLayer); + REG_RUNTIME_LAYER_CLASS(Deconvolution, DeconvolutionLayer); + REG_RUNTIME_LAYER_CLASS(Pooling, PoolingLayer); + REG_RUNTIME_LAYER_CLASS(LRN, LRNLayer); + REG_RUNTIME_LAYER_CLASS(InnerProduct, InnerProductLayer); + REG_RUNTIME_LAYER_CLASS(Softmax, SoftmaxLayer); + REG_RUNTIME_LAYER_CLASS(MVN, MVNLayer); - REG_RUNTIME_LAYER_FUNC(ReLU, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(ChannelsPReLU, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Sigmoid, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(TanH, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(BNLL, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(AbsVal, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Power, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(BatchNorm, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(MaxUnpool, createLayerFromCaffe); + REG_RUNTIME_LAYER_CLASS(ReLU, ReLULayer); + REG_RUNTIME_LAYER_CLASS(ChannelsPReLU, ChannelsPReLULayer); + 
REG_RUNTIME_LAYER_CLASS(Sigmoid, SigmoidLayer); + REG_RUNTIME_LAYER_CLASS(TanH, TanHLayer); + REG_RUNTIME_LAYER_CLASS(BNLL, BNLLLayer); + REG_RUNTIME_LAYER_CLASS(AbsVal, AbsLayer); + REG_RUNTIME_LAYER_CLASS(Power, PowerLayer); + REG_RUNTIME_LAYER_CLASS(BatchNorm, BatchNormLayer); + REG_RUNTIME_LAYER_CLASS(MaxUnpool, MaxUnpoolLayer); REG_RUNTIME_LAYER_CLASS(Dropout, BlankLayer); REG_RUNTIME_LAYER_CLASS(Identity, BlankLayer); - REG_RUNTIME_LAYER_FUNC(Crop, createLayerFromCaffe); - REG_RUNTIME_LAYER_FUNC(Eltwise, createLayerFromCaffe); + REG_RUNTIME_LAYER_CLASS(Crop, CropLayer); + REG_RUNTIME_LAYER_CLASS(Eltwise, EltwiseLayer); REG_RUNTIME_LAYER_CLASS(Permute, PermuteLayer); REG_RUNTIME_LAYER_CLASS(PriorBox, PriorBoxLayer); REG_RUNTIME_LAYER_CLASS(DetectionOutput, DetectionOutputLayer); REG_RUNTIME_LAYER_CLASS(NormalizeBBox, NormalizeBBoxLayer); REG_RUNTIME_LAYER_CLASS(Shift, ShiftLayer); REG_RUNTIME_LAYER_CLASS(Padding, PaddingLayer); - REG_RUNTIME_LAYER_FUNC(Scale, createLayerFromCaffe); + REG_RUNTIME_LAYER_CLASS(Scale, ScaleLayer); init.status = true; } diff --git a/modules/dnn/src/layers/batch_norm_layer.cpp b/modules/dnn/src/layers/batch_norm_layer.cpp index 7f9109e3afa..04ef3c234ca 100644 --- a/modules/dnn/src/layers/batch_norm_layer.cpp +++ b/modules/dnn/src/layers/batch_norm_layer.cpp @@ -9,78 +9,95 @@ Implementation of Batch Normalization layer. */ -#include "batch_norm_layer.hpp" +#include "../precomp.hpp" namespace cv { namespace dnn { -BatchNormLayerImpl::BatchNormLayerImpl(bool hasWeights_, bool hasBias_, float epsilon_): - hasWeights(hasWeights_), - hasBias(hasBias_), - epsilon(epsilon_) -{} - -void BatchNormLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) +class BatchNormLayerImpl : public BatchNormLayer { - CV_Assert(blobs.size() >= 2); - - outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) +public: + BatchNormLayerImpl(const LayerParams& params) { - CV_Assert(blobs[0].total() == inputs[i]->channels()); - CV_Assert(blobs[1].total() == inputs[i]->channels()); - outputs[i].create(inputs[i]->shape()); - } -} - -void BatchNormLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() == 1); + setParamsFrom(params); + CV_Assert(blobs.size() >= 3); - Blob &inpBlob = *inputs[0]; - - int weightsBlobIndex = 2; - int biasBlobIndex = weightsBlobIndex + hasWeights; - - float varMeanScale = 1; - if (!hasWeights && !hasBias) { - varMeanScale = *blobs[2].ptrf(); - if (varMeanScale != 0) - varMeanScale = 1/varMeanScale; + hasWeights = params.get("has_weight", false); + hasBias = params.get("has_bias", false); + epsilon = params.get("eps", 1E-5); } - Mat invStdMat; - cv::pow(blobs[1].matRefConst()*varMeanScale + epsilon, -0.5, invStdMat); + void allocate(const std::vector &inputs, std::vector &outputs) + { + CV_Assert(blobs.size() >= 2); + + outputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + { + CV_Assert(blobs[0].total() == inputs[i]->size[1]); + CV_Assert(blobs[1].total() == inputs[i]->size[1]); + Mat* inp = inputs[i]; + outputs[i].create(inp->dims, &inp->size.p[0], inp->type()); + } + } - for (size_t ii = 0; ii < outputs.size(); ii++) + void forward(std::vector &inputs, std::vector &outputs) { - Blob &outBlob = outputs[ii]; - - if (hasWeights) - CV_Assert(inpBlob.channels() == blobs[weightsBlobIndex].total()); - - if (hasBias) - CV_Assert(inpBlob.channels() == blobs[biasBlobIndex].total()); - - for(int num = 0; num < outBlob.num(); num++) - { - for (int n = 0; n < outBlob.channels(); n++) 
- { - float mean = blobs[0].matRefConst().at(n)*varMeanScale; - double invstd = invStdMat.at(n); - float w = hasWeights ? blobs[weightsBlobIndex].matRefConst().at(n) : 1; - float b = hasBias ? blobs[biasBlobIndex].matRefConst().at(n) : 0; - outBlob.getPlane(num, n) = (inpBlob.getPlane(num, n) - mean)*w*invstd + b; - } - } + CV_Assert(inputs.size() == 1); + + Mat &inpBlob = *inputs[0]; + + int weightsBlobIndex = 2; + int biasBlobIndex = weightsBlobIndex + hasWeights; + + float varMeanScale = 1; + if (!hasWeights && !hasBias) { + varMeanScale = *blobs[2].ptr(); + if (varMeanScale != 0) + varMeanScale = 1/varMeanScale; + } + + Mat invStdMat; + cv::pow(blobs[1]*varMeanScale + epsilon, -0.5, invStdMat); + + int rows = inpBlob.size[2]; + int cols = inpBlob.size[3]; + + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Mat &outBlob = outputs[ii]; + + if (hasWeights) + CV_Assert(inpBlob.size[1] == blobs[weightsBlobIndex].total()); + + if (hasBias) + CV_Assert(inpBlob.size[1] == blobs[biasBlobIndex].total()); + + for(int num = 0; num < outBlob.size[0]; num++) + { + for (int n = 0; n < outBlob.size[1]; n++) + { + float mean = blobs[0].at(n)*varMeanScale; + double invstd = invStdMat.at(n); + float w = hasWeights ? blobs[weightsBlobIndex].at(n) : 1; + float b = hasBias ? blobs[biasBlobIndex].at(n) : 0; + Mat inpBlobPlane(rows, cols, CV_32F, inpBlob.ptr(num, n)); + Mat outBlobPlane(rows, cols, CV_32F, outBlob.ptr(num, n)); + inpBlobPlane.convertTo(outBlobPlane, CV_32F, w*invstd, b - mean*w*invstd); + } + } + } } -} -Ptr BatchNormLayer::create(bool hasWeights, bool hasBias, float epsilon) + bool hasWeights, hasBias; + float epsilon; +}; + +Ptr BatchNormLayer::create(const LayerParams& params) { - return Ptr(new BatchNormLayerImpl(hasWeights, hasBias, epsilon)); + return Ptr(new BatchNormLayerImpl(params)); } } // namespace dnn diff --git a/modules/dnn/src/layers/batch_norm_layer.hpp b/modules/dnn/src/layers/batch_norm_layer.hpp deleted file mode 100644 index 1afa01476ef..00000000000 --- a/modules/dnn/src/layers/batch_norm_layer.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2016, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. - -/* -Declaration of Batch Normalization layer. -*/ - -#ifndef __OPENCV_DNN_LAYERS_BATCH_NORM_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_BATCH_NORM_LAYER_HPP__ -#include - -namespace cv -{ -namespace dnn -{ - -class BatchNormLayerImpl : public BatchNormLayer -{ -public: - BatchNormLayerImpl(bool hasWeights_, bool hasBias_, float epsilon_); - - void allocate(const std::vector &inputs, std::vector &outputs); - - void forward(std::vector &inputs, std::vector &outputs); - -private: - bool hasWeights, hasBias; - float epsilon; -}; - -} -} -#endif // __OPENCV_DNN_LAYERS_BATCH_NORM_LAYER_HPP__ diff --git a/modules/dnn/src/layers/flatten_layer.hpp b/modules/dnn/src/layers/blank_layer.cpp similarity index 78% rename from modules/dnn/src/layers/flatten_layer.hpp rename to modules/dnn/src/layers/blank_layer.cpp index 1aab0eb1a82..581b55d2adc 100644 --- a/modules/dnn/src/layers/flatten_layer.hpp +++ b/modules/dnn/src/layers/blank_layer.cpp @@ -38,30 +38,35 @@ // the use of this software, even if advised of the possibility of such damage. 
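The rewritten BatchNorm forward above folds the whole normalization into a single convertTo() per channel plane. Spelled out, the algebra is (illustrative per-channel scalars var, w, b, mean taken from the blobs as in the code above):

//   y = (x - mean) * w * invstd + b,  invstd = (var*varMeanScale + eps)^(-1/2)
// is the affine map y = x * alpha + beta with
//   alpha = w * invstd,  beta = b - mean * alpha,
// so one Mat::convertTo applies it to an entire plane:
float invstd = 1.f / std::sqrt(var * varMeanScale + epsilon);
float alpha = w * invstd, beta = b - mean * alpha;
inpBlobPlane.convertTo(outBlobPlane, CV_32F, alpha, beta);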
// //M*/ - -#ifndef __OPENCV_DNN_LAYERS_FLATTEN_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_FLATTEN_LAYER_HPP__ #include "../precomp.hpp" namespace cv { namespace dnn { -class FlattenLayer : public Layer +class BlankLayerImpl : public BlankLayer { - int _startAxis; - int _endAxis; - size_t _numAxes; - - BlobShape resultShape; - public: - FlattenLayer(LayerParams ¶ms); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); + BlankLayerImpl(const LayerParams&) {} + + void allocate(const std::vector &inputs, std::vector &outputs) + { + outputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + outputs[i] = *inputs[i]; + } - void checkInputs(const std::vector &inputs); + void forward(std::vector &inputs, std::vector &outputs) + { + for (size_t i = 0; i < inputs.size(); i++) + outputs[i] = *inputs[i]; + } }; + +Ptr BlankLayer::create(const LayerParams& params) +{ + return Ptr(new BlankLayerImpl(params)); +} + } } -#endif diff --git a/modules/dnn/src/layers/blank_layer.hpp b/modules/dnn/src/layers/blank_layer.hpp deleted file mode 100644 index 6d93f278a87..00000000000 --- a/modules/dnn/src/layers/blank_layer.hpp +++ /dev/null @@ -1,74 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
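BlankLayerImpl above is the inference-time no-op that Dropout and Identity resolve to; its forward() re-publishes the input Mat headers, so the pass-through costs no copy (Mat assignment shares data). The wiring is done in init.cpp earlier in this patch:

REG_RUNTIME_LAYER_CLASS(Dropout, BlankLayer);
REG_RUNTIME_LAYER_CLASS(Identity, BlankLayer);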
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_BLANK_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_BLANK_LAYER_HPP__ -#include "../precomp.hpp" - -namespace cv -{ -namespace dnn -{ - class BlankLayer : public Layer - { - public: - - BlankLayer(LayerParams&) - { - - } - - void allocate(const std::vector &inputs, std::vector &outputs) - { - outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) - outputs[i].shareFrom(*inputs[i]); - } - - void forward(std::vector &inputs, std::vector &outputs) - { - for (size_t i = 0; i < inputs.size(); i++) - outputs[i] = *inputs[i]; - } - }; -} -} -#endif diff --git a/modules/dnn/src/layers/concat_layer.cpp b/modules/dnn/src/layers/concat_layer.cpp index 61341fefa4b..61bd83c64e0 100644 --- a/modules/dnn/src/layers/concat_layer.cpp +++ b/modules/dnn/src/layers/concat_layer.cpp @@ -41,80 +41,69 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "concat_layer.hpp" -#include namespace cv { namespace dnn { -ConcatLayerImpl::ConcatLayerImpl(int axis_ /*= 1*/) +class ConcatLayerImpl : public ConcatLayer { - axis = axis_; -} - -void ConcatLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() > 0); - - BlobShape refShape = inputs[0]->shape(); - axisIdx = inputs[0]->canonicalAxis(axis); +public: + ConcatLayerImpl(const LayerParams& params) + { + setParamsFrom(params); + axis = params.get("axis", 1); + } - int axisSum = 0; - useOpenCL = false; - for (size_t i = 0; i < inputs.size(); i++) + void allocate(const std::vector &inputs, std::vector &outputs) { - BlobShape curShape = inputs[i]->shape(); + CV_Assert(inputs.size() > 0); - CV_Assert(curShape.dims() == refShape.dims() && inputs[i]->type() == inputs[0]->type()); - for (int curAxis = 0; curAxis < refShape.dims(); curAxis++) + int dims = inputs[0]->dims, dtype = inputs[0]->type(); + std::vector refShape(inputs[0]->size.p, inputs[0]->size.p + dims); + axisIdx = axis < 0 ? axis + dims : axis; + + int axisSum = 0; + for (size_t i = 0; i < inputs.size(); i++) { - if (curAxis != axisIdx && refShape[curAxis] != curShape[curAxis]) - CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer"); + CV_Assert(inputs[i]->type() == dtype); + for (int curAxis = 0; curAxis < dims; curAxis++) + { + if (curAxis != axisIdx && inputs[0]->size[curAxis] != inputs[i]->size[curAxis]) + CV_Error(Error::StsBadSize, "Inconsitent shape for ConcatLayer"); + } + + axisSum += inputs[i]->size[axisIdx]; } - axisSum += curShape[axisIdx]; - useOpenCL |= inputs[i]->getState() == Blob::HEAD_AT_MAT; - } - - refShape[axisIdx] = axisSum; - useOpenCL &= ocl::useOpenCL(); - int allocFlags = (useOpenCL) ? 
Blob::ALLOC_UMAT : Blob::ALLOC_MAT; - - outputs.resize(1); - outputs[0].create(refShape, inputs[0]->type(), allocFlags); -} + refShape[axisIdx] = axisSum; + outputs.resize(1); + outputs[0].create(dims, &refShape[0], dtype); + } -void ConcatLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - #ifdef HAVE_OPENCL - if (useOpenCL) - forward_(inputs, outputs); - else - #endif - forward_(inputs, outputs); -} - -template -void ConcatLayerImpl::forward_(std::vector &inputs, std::vector &outputs) -{ - XMat& outMat = outputs[0].getRef(); - std::vector ranges(outputs[0].dims(), Range::all()); - ranges[axisIdx].start = 0; - for (size_t i = 0; i < inputs.size(); i++) + void forward(std::vector &inputs, std::vector &outputs) { - ranges[axisIdx].end = ranges[axisIdx].start + inputs[i]->size(axisIdx); - inputs[i]->getRefConst().copyTo(outMat(&ranges[0])); - ranges[axisIdx].start = ranges[axisIdx].end; + Mat& outMat = outputs[0]; + std::vector ranges(outputs[0].dims, Range::all()); + + ranges[axisIdx].start = 0; + for (size_t i = 0; i < inputs.size(); i++) + { + ranges[axisIdx].end = ranges[axisIdx].start + inputs[i]->size[axisIdx]; + inputs[i]->copyTo(outMat(&ranges[0])); + ranges[axisIdx].start = ranges[axisIdx].end; + } } -} -Ptr ConcatLayer::create(int axis) + int axisIdx; +}; + +Ptr ConcatLayer::create(const LayerParams& params) { - return Ptr(new ConcatLayerImpl(axis)); + return Ptr(new ConcatLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/concat_layer.hpp b/modules/dnn/src/layers/concat_layer.hpp deleted file mode 100644 index 86f2083b942..00000000000 --- a/modules/dnn/src/layers/concat_layer.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
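The new ConcatLayerImpl copies each input into a Range-bounded view of the output along axisIdx. A standalone sketch of that copy, for two hypothetical 4D blobs a and b (equal in every dimension but the channel axis) concatenated over channels:

// Concatenate a and b along axis 1 using Range-sliced views of the output.
int sz[] = { 1, a.size[1] + b.size[1], a.size[2], a.size[3] };
Mat out(4, sz, a.type());
std::vector<Range> ranges(4, Range::all());
ranges[1] = Range(0, a.size[1]);
a.copyTo(out(&ranges[0]));             // first slab
ranges[1] = Range(a.size[1], sz[1]);
b.copyTo(out(&ranges[0]));             // second slab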
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_CONCAT_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_CONCAT_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class ConcatLayerImpl : public ConcatLayer -{ - bool useOpenCL; - int axisIdx; - - template - void forward_(std::vector &inputs, std::vector &outputs); - -public: - ConcatLayerImpl(int axis_ = 1); - - void allocate(const std::vector &inputs, std::vector &outputs); - - void forward(std::vector &inputs, std::vector &outputs); -}; - -} -} -#endif diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 48d12a35410..5ce08646ab8 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -40,9 +40,7 @@ //M*/ #include "../precomp.hpp" -#include #include "layers_common.hpp" -#include "convolution_layer.hpp" #include "op_im2col.hpp" #include "op_blas.hpp" #include @@ -53,12 +51,54 @@ namespace cv namespace dnn { +class BaseConvolutionLayerImpl : public ConvolutionLayer +{ +public: + BaseConvolutionLayerImpl(); + virtual void allocate(const std::vector &inputs, std::vector &outputs); + + void init(); + virtual void computeInpOutShape(const Mat &inpBlob) = 0; + bool is1x1() const; + + int numOutput, group; + int inpH, inpW, inpCn; + int outH, outW, outCn; + int inpGroupCn, outGroupCn; + int ksize; + std::vector colRowBlobShape; + + bool bias; + Mat colRowBlob, biasOnesBlob; +}; + +//TODO: simultaneously convolution and bias addition for cache optimization +class ConvolutionLayerImpl : public BaseConvolutionLayerImpl +{ +public: + virtual void forward(std::vector &inputs, std::vector &outputs); + virtual void computeInpOutShape(const Mat &inpBlob); + + void im2col(const Mat &srcImg, Mat &dstCol); + void im2row(const Mat &srcImg, Mat &dstRow); +}; + +class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl +{ +public: + virtual void forward(std::vector &inputs, std::vector &outputs); + + virtual void computeInpOutShape(const Mat &inpBlob); + void col2im(const Mat &colMat, Mat &dstImg); +}; + + BaseConvolutionLayerImpl::BaseConvolutionLayerImpl(): numOutput(-1), group(-1), inpH(0), inpW(0), inpCn(0), outH(0), outW(0), outCn(0), inpGroupCn(0), outGroupCn(0), - ksize(0), bias(false), tryUseOpenCL(false) + ksize(0), bias(false) { #ifdef HAVE_LAPACK if (getBlasThreads() != cv::getThreadNum()) @@ -71,46 +111,44 @@ BaseConvolutionLayerImpl::BaseConvolutionLayerImpl(): void BaseConvolutionLayerImpl::init() { CV_Assert(blobs.size() >= 1 && blobs.size() <= 2); - CV_Assert(blobs[0].dims() == 4 && blobs[0].cols() == kernel.width && blobs[0].rows() == kernel.height); + CV_Assert(blobs[0].dims == 4 && blobs[0].size[3] == kernel.width && blobs[0].size[2] == kernel.height); bias = (blobs.size() >= 2); - useOpenCL = ocl::useOpenCL() && tryUseOpenCL && dilation == Size(1, 1); } -void BaseConvolutionLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) +void 
BaseConvolutionLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) { CV_Assert(inputs.size() > 0); init(); - const Blob &input = *inputs[0]; - CV_Assert(input.dims() == 4 && (input.type() == CV_32F || input.type() == CV_64F)); + const Mat &input = *inputs[0]; + CV_Assert(input.dims == 4 && (input.type() == CV_32F || input.type() == CV_64F)); for (size_t i = 0; i < inputs.size(); i++) { CV_Assert(inputs[i]->type() == input.type()); - CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == input.channels()); - CV_Assert(inputs[i]->rows() == input.rows() && inputs[i]->cols() == input.cols()); + CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == input.size[1]); + CV_Assert(inputs[i]->size[2] == input.size[2] && inputs[i]->size[3] == input.size[3]); } computeInpOutShape(input); - int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT; - if (bias) { - biasOnesBlob.create(Shape(1, outH * outW), input.type(), allocFlags); + biasOnesBlob.create(1, outH * outW, input.type()); biasOnesBlob.setTo(1); } outputs.resize(inputs.size()); for (size_t i = 0; i < inputs.size(); i++) { - outputs[i].create(Shape(inputs[i]->num(), outCn, outH, outW), input.type(), allocFlags); + int sz[] = { inputs[i]->size[0], outCn, outH, outW }; + outputs[i].create(4, sz, input.type()); } if (!is1x1()) { - colRowBlob.create(colRowBlobShape, input.type(), allocFlags); + colRowBlob.create((int)colRowBlobShape.size(), &colRowBlobShape[0], input.type()); colRowBlob.setTo(0); } } @@ -122,15 +160,15 @@ bool BaseConvolutionLayerImpl::is1x1() const (dilation.height == 1 && dilation.width == 1); } -void ConvolutionLayerImpl::computeInpOutShape(const Blob &input) +void ConvolutionLayerImpl::computeInpOutShape(const Mat &input) { - CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].num()); + CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]); - numOutput = blobs[0].num(); + numOutput = blobs[0].size[0]; - inpH = input.rows(); - inpW = input.cols(); - inpCn = input.channels(); + inpH = input.size[2]; + inpW = input.size[3]; + inpCn = input.size[1]; outCn = numOutput; if (padMode.empty()) @@ -143,90 +181,67 @@ void ConvolutionLayerImpl::computeInpOutShape(const Blob &input) getConvPoolOutParams(inpH, inpW, kernel, stride, pad, padMode, outH, outW); } - group = inpCn / blobs[0].channels(); + group = inpCn / blobs[0].size[1]; CV_Assert(inpCn % group == 0 && outCn % group == 0); - CV_Assert(blobs[0].num() == outCn && blobs[0].channels() == inpCn / group); + CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group); outGroupCn = outCn / group; inpGroupCn = inpCn / group; ksize = inpGroupCn * kernel.height * kernel.width; - colRowBlobShape = BlobShape(outH * outW, ksize); + colRowBlobShape.clear(); + colRowBlobShape.push_back(outH*outW); + colRowBlobShape.push_back(ksize); } -template -void ConvolutionLayerImpl::forward_(std::vector &inputs, std::vector &outputs) +void ConvolutionLayerImpl::forward(std::vector &inputs, std::vector &outputs) { CV_Assert(inputs.size() > 0); - XMat weightsMat = reshaped(blobs[0].getRefConst(), Shape(outCn, ksize)); - XMat biasesMat = (bias) ? reshaped(blobs[1].getRefConst(), Shape(outCn, 1)) : XMat(); + Mat weightsMat = blobs[0].reshape(1, outCn); + Mat biasesMat = bias ? 
blobs[1].reshape(1, outCn) : Mat(); for (size_t ii = 0; ii < outputs.size(); ii++) { - int numImg = inputs[ii]->size(0); - XMat inpMat = inputs[ii]->getRefConst(); - XMat outMat = reshaped(outputs[ii].getRef(), Shape(numImg*group*outGroupCn, outH*outW)); + int numImg = inputs[ii]->size[0]; + Mat inpMat = *inputs[ii]; + Mat outMat = outputs[ii].reshape(1, numImg*group*outGroupCn); for (int n = 0; n < numImg; n++) { for (int g = 0; g < group; g++) { - XMat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn)); + Mat colMat, curInp = slice(inpMat, n, _Range(g * inpGroupCn, inpGroupCn)); im2row(curInp, colMat); _Range kerRange(g * outGroupCn, outGroupCn); - XMat kerMat = weightsMat.rowRange(kerRange); + Mat kerMat = weightsMat.rowRange(kerRange); _Range outRange((g + n * group) * outGroupCn, outGroupCn); - XMat dstMat = outMat.rowRange(outRange); + Mat dstMat = outMat.rowRange(outRange); dnn::gemm(kerMat, colMat, 1, dstMat, 0, GEMM_2_T); if (bias) { - dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob.getRefConst(), 1, dstMat, 1); + dnn::gemm(biasesMat.rowRange(kerRange), biasOnesBlob, 1, dstMat, 1); } } } } } -void ConvolutionLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - if (!useOpenCL) - forward_(inputs, outputs); - else - forward_(inputs, outputs); -} - -void ConvolutionLayerImpl::im2col(const UMat &srcImg, UMat &dstCol) -{ - if (is1x1()) - { - dstCol = reshaped(srcImg, Shape(ksize, outH*outW)); - return; - } -#ifdef HAVE_OPENCL - CV_Assert(im2col_ocl(srcImg, inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dilation.height, dilation.width, this->colRowBlob.umatRef())); - dstCol = this->colRowBlob.umatRefConst(); -#else - CV_Error(Error::StsInternal, ""); - dstCol = srcImg; //supress warning -#endif -} - void ConvolutionLayerImpl::im2col(const Mat &srcImg, Mat &dstCol) { if (is1x1()) { - dstCol = reshaped(srcImg, Shape(ksize, outH*outW)); + dstCol = srcImg.reshape(1, ksize); return; } - Mat &colMat = colRowBlob.matRef(); + Mat &colMat = colRowBlob; if (srcImg.type() == CV_32F) im2col_CpuPBody::run(srcImg.ptr(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, @@ -243,11 +258,11 @@ void ConvolutionLayerImpl::im2row(const Mat &srcImg, Mat &dstRow) { if (is1x1()) { - dstRow = reshaped(srcImg, Shape(ksize, outH*outW)).t(); + dstRow = srcImg.reshape(1, ksize).t(); return; } - Mat &colMat = colRowBlob.matRef(); + Mat &colMat = colRowBlob; if (srcImg.type() == CV_32F) im2row_CpuPBody::run(srcImg.ptr(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, @@ -260,67 +275,55 @@ void ConvolutionLayerImpl::im2row(const Mat &srcImg, Mat &dstRow) dstRow = colMat; } -void ConvolutionLayerImpl::im2row(const UMat &srcImg, UMat &dstCol) -{ - CV_Error(cv::Error::StsNotImplemented, ""); -} - //Deconvolution -void DeConvolutionLayerImpl::computeInpOutShape(const Blob &inpBlob) +void DeConvolutionLayerImpl::computeInpOutShape(const Mat &inpBlob) { - CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].num()); + CV_Assert(!bias || blobs[1].total() == (size_t)blobs[0].size[0]); - numOutput = blobs[0].num(); + numOutput = blobs[0].size[0]; - inpH = inpBlob.rows(); - inpW = inpBlob.cols(); - inpCn = inpBlob.channels(); + inpH = inpBlob.size[2]; + inpW = inpBlob.size[3]; + inpCn = inpBlob.size[1]; outH = stride.height * (inpH - 1) + kernel.height - 2 * pad.height + adjustPad.height; outW = stride.width * (inpW - 1) 
+ kernel.width - 2 * pad.width + adjustPad.width; outCn = numOutput; - group = inpCn / blobs[0].channels(); + group = inpCn / blobs[0].size[1]; outGroupCn = outCn / group; inpGroupCn = inpCn / group; ksize = outGroupCn * kernel.height * kernel.width; CV_Assert(inpCn % group == 0 && outCn % group == 0); - CV_Assert(blobs[0].num() == outCn && blobs[0].channels() == inpCn / group); + CV_Assert(blobs[0].size[0] == outCn && blobs[0].size[1] == inpCn / group); - colRowBlobShape = BlobShape(ksize, inpH * inpW); + colRowBlobShape.clear(); + colRowBlobShape.push_back(ksize); + colRowBlobShape.push_back(inpH * inpW); } -void DeConvolutionLayerImpl::forward(std::vector &inputs, std::vector &outputs) +void DeConvolutionLayerImpl::forward(std::vector &inputs, std::vector &outputs) { - if (!useOpenCL) - forward_(inputs, outputs); - else - forward_(inputs, outputs); -} - -template -void DeConvolutionLayerImpl::forward_(std::vector &inputs, std::vector &outputs) -{ - XMat weightsMat = reshaped(blobs[0].getRefConst(), Shape(inpCn, ksize)); - XMat biasesMat = (bias) ? reshaped(blobs[1].getRefConst(), Shape(outCn, 1)) : XMat(); + Mat weightsMat = blobs[0].reshape(1, inpCn); + Mat biasesMat = bias ? blobs[1].reshape(1, outCn) : Mat(); for (size_t ii = 0; ii < outputs.size(); ii++) { - int numImg = inputs[ii]->size(0); - XMat convBlob = reshaped(inputs[ii]->getRefConst(), Shape(numImg*inpCn, inpH*inpW)); - XMat decnBlob = reshaped(outputs[ii].getRef(), Shape(numImg*outCn, outH*outW)); + int numImg = inputs[ii]->size[0]; + Mat convBlob = inputs[ii]->reshape(1, numImg*inpCn); + Mat decnBlob = outputs[ii].reshape(1, numImg*outCn); for (int n = 0; n < numImg; n++) { for (int g = 0; g < group; g++) { - XMat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn)); - XMat &colMat = (is1x1()) ? dstMat : colRowBlob.getRef(); + Mat dstMat = decnBlob.rowRange(_Range((g + n * group) * outGroupCn, outGroupCn)); + Mat &colMat = (is1x1()) ? 
dstMat : colRowBlob; - XMat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn)); - XMat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn)); + Mat convMat = convBlob.rowRange(_Range((g + n * group) * inpGroupCn, inpGroupCn)); + Mat wghtMat = weightsMat.rowRange(_Range(g * inpGroupCn, inpGroupCn)); dnn::gemm(wghtMat, convMat, 1, colMat, 0, GEMM_1_T); @@ -329,8 +332,8 @@ void DeConvolutionLayerImpl::forward_(std::vector &inputs, std::vector(), 1, dstMat, 1); + Mat curBiasMat = biasesMat.rowRange(_Range(g * outGroupCn, outGroupCn)); + dnn::gemm(curBiasMat, biasOnesBlob, 1, dstMat, 1); } } } @@ -350,24 +353,9 @@ void DeConvolutionLayerImpl::col2im(const Mat &colMat, Mat &dstImg) col2im_CpuPBody::run(colMat.ptr(), inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg.ptr()); } -void DeConvolutionLayerImpl::col2im(const UMat &colMat, UMat &dstImg) -{ - if (is1x1()) - { - dstImg = colMat; - return; - } -#ifdef HAVE_OPENCL - CV_Assert(col2im_ocl(colMat, inpGroupCn, inpH, inpW, kernel.height, kernel.width, pad.height, pad.width, stride.height, stride.width, dstImg)); -#else - CV_Error(Error::StsInternal, ""); - dstImg = colMat; -#endif -} - //Initializers -Ptr ConvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation) +/*Ptr ConvolutionLayer::create(Size kernel, Size stride, Size pad, Size dilation) { ConvolutionLayerImpl *l = new ConvolutionLayerImpl(); l->kernel = kernel; @@ -387,6 +375,40 @@ Ptr DeconvolutionLayer::create(Size kernel, Size stride, S l->adjustPad = adjustPad; return Ptr(l); +}*/ + +//Convolution and Deconvolution +static void initConvDeconvLayerFromCaffe(Ptr l, const LayerParams ¶ms) +{ + l->setParamsFrom(params); + getConvolutionKernelParams(params, l->kernel.height, l->kernel.width, l->pad.height, + l->pad.width, l->stride.height, l->stride.width, l->dilation.height, + l->dilation.width, l->padMode); + + bool bias = params.get("bias_term", true); + int numOutput = params.get("num_output"); + int group = params.get("group", 1); + + l->adjustPad.height = params.get("adj_h", 0); + l->adjustPad.width = params.get("adj_w", 0); + + CV_Assert(numOutput % group == 0); + CV_Assert((bias && l->blobs.size() == 2) || (!bias && l->blobs.size() == 1)); +} + +Ptr ConvolutionLayer::create(const LayerParams ¶ms) +{ + Ptr l(new ConvolutionLayerImpl); + initConvDeconvLayerFromCaffe(l, params); + return l; +} + +Ptr DeconvolutionLayer::create(const LayerParams ¶ms) +{ + Ptr l(new DeConvolutionLayerImpl); + initConvDeconvLayerFromCaffe(l, params); + + return l; } } diff --git a/modules/dnn/src/layers/convolution_layer.hpp b/modules/dnn/src/layers/convolution_layer.hpp deleted file mode 100644 index de2b0ab7811..00000000000 --- a/modules/dnn/src/layers/convolution_layer.hpp +++ /dev/null @@ -1,116 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. 
-// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_CONVOLUTION_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class BaseConvolutionLayerImpl : public ConvolutionLayer -{ -public: - BaseConvolutionLayerImpl(); - virtual void allocate(const std::vector &inputs, std::vector &outputs); - -protected: - void init(); - virtual void computeInpOutShape(const Blob &inpBlob) = 0; - bool is1x1() const; - - int numOutput, group; - int inpH, inpW, inpCn; - int outH, outW, outCn; - int inpGroupCn, outGroupCn; - int ksize; - BlobShape colRowBlobShape; - - bool bias; - bool tryUseOpenCL, useOpenCL; - - Blob colRowBlob, biasOnesBlob; - -}; - -//TODO: simultaneously convolution and bias addition for cache optimization -class ConvolutionLayerImpl : public BaseConvolutionLayerImpl -{ -public: - virtual void forward(std::vector &inputs, std::vector &outputs); - -protected: - virtual void computeInpOutShape(const Blob &inpBlob); - - template - void forward_(std::vector &inputs, std::vector &outputs); - void im2col(const Mat &srcImg, Mat &dstCol); - void im2row(const Mat &srcImg, Mat &dstRow); - void im2col(const UMat &srcImg, UMat &dstCol); - void im2row(const UMat &srcImg, UMat &dstCol); -}; - -class DeConvolutionLayerImpl : public BaseConvolutionLayerImpl -{ -public: - virtual void forward(std::vector &inputs, std::vector &outputs); - -protected: - - virtual void computeInpOutShape(const Blob &inpBlob); - - template - void forward_(std::vector &inputs, std::vector &outputs); - void col2im(const Mat &colMat, Mat &dstImg); - void col2im(const UMat &colMat, UMat &dstImg); -}; - -//Importers -Ptr createConvolutionLayerFromCaffe(LayerParams ¶ms); -Ptr createDeconvolutionLayerFromCaffe(LayerParams ¶ms); - -} -} - -#endif diff --git a/modules/dnn/src/layers/crop_layer.cpp b/modules/dnn/src/layers/crop_layer.cpp index 06f6f75b3a5..422144c6c66 100755 --- a/modules/dnn/src/layers/crop_layer.cpp +++ b/modules/dnn/src/layers/crop_layer.cpp @@ -41,87 +41,97 @@ #include 
"../precomp.hpp" #include "layers_common.hpp" -#include "crop_layer.hpp" namespace cv { namespace dnn { -CropLayerImpl::CropLayerImpl(int start_axis_, const std::vector &offset_) +class CropLayerImpl : public CropLayer { - startAxis = start_axis_; - offset = offset_; -} - -void CropLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - CV_Assert(2 == inputs.size()); - - const Blob &inpBlob = *inputs[0]; - const Blob &inpSzBlob = *inputs[1]; - - int start_axis = inpBlob.canonicalAxis(startAxis); - int dims = inpBlob.dims(); - - std::vector offset_final(dims, 0); - if (offset.size() == 1) +public: + CropLayerImpl(const LayerParams& params) { - for (int i = start_axis; i < dims; i++) - offset_final[i] = offset[0]; - } - else if (offset.size() > 1) - { - if ((int)offset.size() != dims - start_axis) - CV_Error(Error::StsBadArg, "number of offset values specified must be equal to the number of dimensions following axis."); + setParamsFrom(params); + startAxis = params.get("axis", 2); + const DictValue *paramOffset = params.ptr("offset"); - for (int i = start_axis; i < dims; i++) - offset_final[i] = offset[i - start_axis]; + if (paramOffset) + { + for (int i = 0; i < paramOffset->size(); i++) + offset.push_back(paramOffset->get(i)); + } } - BlobShape dstShape = inpBlob.shape(); - crop_ranges.resize(dims, Range::all()); - for (int i = start_axis; i < dims; i++) + void allocate(const std::vector &inputs, std::vector &outputs) { - dstShape[i] = inpSzBlob.size(i); + CV_Assert(2 == inputs.size()); - if (!offset.empty()) //normal case - { - if (offset_final[i] < 0 || offset_final[i] + inpSzBlob.size(i) > inpBlob.size(i)) - CV_Error(Error::StsBadArg, "invalid crop parameters"); + const Mat &inpBlob = *inputs[0]; + const Mat &inpSzBlob = *inputs[1]; + + int dims = inpBlob.dims; + int start_axis = startAxis < 0 ? 
startAxis + dims : startAxis; - crop_ranges[i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size(i)); + std::vector offset_final(dims, 0); + if (offset.size() == 1) + { + for (int i = start_axis; i < dims; i++) + offset_final[i] = offset[0]; } - else //detect offset automatically so that cropped image is center of original one + else if (offset.size() > 1) { - if (inpSzBlob.size(i) > inpBlob.size(i)) - CV_Error(Error::StsBadArg, "invalid output blob size"); + if ((int)offset.size() != dims - start_axis) + CV_Error(Error::StsBadArg, "number of offset values specified must be equal to the number of dimensions following axis."); + + for (int i = start_axis; i < dims; i++) + offset_final[i] = offset[i - start_axis]; + } - int cur_crop = (inpBlob.size(i) - inpSzBlob.size(i)) / 2; - crop_ranges[i] = Range(cur_crop, cur_crop + inpSzBlob.size(i)); + std::vector dstShape(dims); + crop_ranges.resize(dims, Range::all()); + for (int i = 0; i < dims; i++) + { + dstShape[i] = inpSzBlob.size[i]; + if( i < start_axis ) + continue; + + if (!offset.empty()) //normal case + { + if (offset_final[i] < 0 || offset_final[i] + inpSzBlob.size[i] > inpBlob.size[i]) + CV_Error(Error::StsBadArg, "invalid crop parameters"); + + crop_ranges[i] = Range(offset_final[i], offset_final[i] + inpSzBlob.size[i]); + } + else //detect offset automatically so that cropped image is center of original one + { + if (inpSzBlob.size[i] > inpBlob.size[i]) + CV_Error(Error::StsBadArg, "invalid output blob size"); + + int cur_crop = (inpBlob.size[i] - inpSzBlob.size[i]) / 2; + crop_ranges[i] = Range(cur_crop, cur_crop + inpSzBlob.size[i]); + } } + + outputs.resize(1); + outputs[0].create(dims, &dstShape[0], inpBlob.type()); } - outputs.resize(1); - outputs[0].create(dstShape); -} + void forward(std::vector &inputs, std::vector &outputs) + { + Mat &input = *inputs[0]; + Mat &output = outputs[0]; + + input(&crop_ranges[0]).copyTo(output); + } + + std::vector crop_ranges; +}; -void CropLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - Blob &input = *inputs[0]; - Blob &output = outputs[0]; - - #ifdef HAVE_OPENCL - if (input.getState() == Blob::HEAD_AT_UMAT) - input.umatRefConst()(&crop_ranges[0]).copyTo(output.umatRef()); - else - #endif - input.matRefConst()(&crop_ranges[0]).copyTo(output.matRef()); -} -Ptr CropLayer::create(int start_axis, const std::vector &offset) +Ptr CropLayer::create(const LayerParams& params) { - return Ptr(new CropLayerImpl(start_axis, offset)); + return Ptr(new CropLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/crop_layer.hpp b/modules/dnn/src/layers/crop_layer.hpp deleted file mode 100755 index bc8789b9054..00000000000 --- a/modules/dnn/src/layers/crop_layer.hpp +++ /dev/null @@ -1,62 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. 
-// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_CROP_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_CROP_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - class CropLayerImpl : public CropLayer - { - std::vector crop_ranges; - - public: - CropLayerImpl(int start_axis, const std::vector &offset); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); - }; -} -} -#endif diff --git a/modules/dnn/src/layers/detection_output_layer.cpp b/modules/dnn/src/layers/detection_output_layer.cpp index 00002dbb4bc..bcffda41180 100644 --- a/modules/dnn/src/layers/detection_output_layer.cpp +++ b/modules/dnn/src/layers/detection_output_layer.cpp @@ -41,9 +41,9 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "detection_output_layer.hpp" #include #include +#include namespace cv { @@ -52,6 +52,7 @@ namespace dnn namespace util { + template std::string to_string(T value) { @@ -74,676 +75,742 @@ bool SortScorePairDescend(const std::pair& pair1, { return pair1.first > pair2.first; } -} -const std::string DetectionOutputLayer::_layerName = std::string("DetectionOutput"); +} -bool DetectionOutputLayer::getParameterDict(const LayerParams ¶ms, - const std::string ¶meterName, - DictValue& result) +class DetectionOutputLayerImpl : public DetectionOutputLayer { - if (!params.has(parameterName)) - { - return false; - } +public: + unsigned _numClasses; + bool _shareLocation; + int _numLocClasses; - result = params.get(parameterName); - return true; -} + int _backgroundLabelId; -template -T DetectionOutputLayer::getParameter(const LayerParams ¶ms, - const std::string ¶meterName, - const size_t &idx, - const bool required, - const T& defaultValue) -{ - DictValue dictValue; - bool success = getParameterDict(params, parameterName, dictValue); - if(!success) + typedef caffe::PriorBoxParameter_CodeType CodeType; + CodeType _codeType; + + bool _varianceEncodedInTarget; + int _keepTopK; + float _confidenceThreshold; + + int _num; + int _numPriors; + + float 
_nmsThreshold; + int _topK; + + enum { _numAxes = 4 }; + static const std::string _layerName; + + typedef std::map > LabelBBox; + + bool getParameterDict(const LayerParams ¶ms, + const std::string ¶meterName, + DictValue& result) { - if(required) + if (!params.has(parameterName)) { - std::string message = _layerName; - message += " layer parameter does not contain "; - message += parameterName; - message += " parameter."; - CV_Error(Error::StsBadArg, message); + return false; } - else + + result = params.get(parameterName); + return true; + } + + template + T getParameter(const LayerParams ¶ms, + const std::string ¶meterName, + const size_t &idx=0, + const bool required=true, + const T& defaultValue=T()) + { + DictValue dictValue; + bool success = getParameterDict(params, parameterName, dictValue); + if(!success) { - return defaultValue; + if(required) + { + std::string message = _layerName; + message += " layer parameter does not contain "; + message += parameterName; + message += " parameter."; + CV_Error(Error::StsBadArg, message); + } + else + { + return defaultValue; + } } + return dictValue.get(idx); } - return dictValue.get(idx); -} -void DetectionOutputLayer::getCodeType(LayerParams ¶ms) -{ - String codeTypeString = params.get("code_type").toLowerCase(); - if (codeTypeString == "corner") - _codeType = caffe::PriorBoxParameter_CodeType_CORNER; - else if (codeTypeString == "center_size") - _codeType = caffe::PriorBoxParameter_CodeType_CENTER_SIZE; - else - _codeType = caffe::PriorBoxParameter_CodeType_CORNER; -} + void getCodeType(const LayerParams ¶ms) + { + String codeTypeString = params.get("code_type").toLowerCase(); + if (codeTypeString == "corner") + _codeType = caffe::PriorBoxParameter_CodeType_CORNER; + else if (codeTypeString == "center_size") + _codeType = caffe::PriorBoxParameter_CodeType_CENTER_SIZE; + else + _codeType = caffe::PriorBoxParameter_CodeType_CORNER; + } -DetectionOutputLayer::DetectionOutputLayer(LayerParams ¶ms) : Layer(params) -{ - _numClasses = getParameter(params, "num_classes"); - _shareLocation = getParameter(params, "share_location"); - _numLocClasses = _shareLocation ? 1 : _numClasses; - _backgroundLabelId = getParameter(params, "background_label_id"); - _varianceEncodedInTarget = getParameter(params, "variance_encoded_in_target", 0, false, false); - _keepTopK = getParameter(params, "keep_top_k"); - _confidenceThreshold = getParameter(params, "confidence_threshold", 0, false, -FLT_MAX); - _topK = getParameter(params, "top_k", 0, false, -1); - - getCodeType(params); - - // Parameters used in nms. - _nmsThreshold = getParameter(params, "nms_threshold"); - CV_Assert(_nmsThreshold > 0.); -} + DetectionOutputLayerImpl(const LayerParams ¶ms) + { + _numClasses = getParameter(params, "num_classes"); + _shareLocation = getParameter(params, "share_location"); + _numLocClasses = _shareLocation ? 1 : _numClasses; + _backgroundLabelId = getParameter(params, "background_label_id"); + _varianceEncodedInTarget = getParameter(params, "variance_encoded_in_target", 0, false, false); + _keepTopK = getParameter(params, "keep_top_k"); + _confidenceThreshold = getParameter(params, "confidence_threshold", 0, false, -FLT_MAX); + _topK = getParameter(params, "top_k", 0, false, -1); + + getCodeType(params); + + // Parameters used in nms. 
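The getParameter() helper above treats a key as required unless required=false is passed, raising StsBadArg when a required key is missing. A minimal sketch of feeding this constructor by hand, assuming the DetectionOutputLayer::create(params) factory this patch introduces; every value below is an illustrative assumption, not taken from the patch:

    // Hedged sketch: hand-built LayerParams for the constructor above.
    // All values are illustrative assumptions.
    LayerParams lp;
    lp.set("num_classes", 21);                  // required (no default)
    lp.set("share_location", true);             // required
    lp.set("background_label_id", 0);           // required
    lp.set("keep_top_k", 200);                  // required
    lp.set("code_type", String("CENTER_SIZE")); // anything else decodes as CORNER
    lp.set("nms_threshold", 0.45f);             // must be > 0 (asserted just below)
    Ptr<DetectionOutputLayer> det = DetectionOutputLayer::create(lp);

The optional keys (variance_encoded_in_target, confidence_threshold, top_k) simply fall back to their defaults when absent.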
+ _nmsThreshold = getParameter(params, "nms_threshold"); + CV_Assert(_nmsThreshold > 0.); + + setParamsFrom(params); + } -void DetectionOutputLayer::checkInputs(const std::vector &inputs) -{ - for (size_t i = 1; i < inputs.size(); i++) + void checkInputs(const std::vector &inputs) { - for (size_t j = 0; j < _numAxes; j++) + for (size_t i = 1; i < inputs.size(); i++) { - CV_Assert(inputs[i]->shape()[j] == inputs[0]->shape()[j]); + CV_Assert(inputs[i]->size == inputs[0]->size); } } -} -void DetectionOutputLayer::allocate(const std::vector &inputs, - std::vector &outputs) -{ - CV_Assert(inputs.size() > 0); - CV_Assert(inputs[0]->num() == inputs[1]->num()); - _num = inputs[0]->num(); - - _numPriors = inputs[2]->rows() / 4; - CV_Assert((_numPriors * _numLocClasses * 4) == inputs[0]->channels()); - CV_Assert(int(_numPriors * _numClasses) == inputs[1]->channels()); - - // num() and channels() are 1. - // Since the number of bboxes to be kept is unknown before nms, we manually - // set it to (fake) 1. - // Each row is a 7 dimension std::vector, which stores - // [image_id, label, confidence, xmin, ymin, xmax, ymax] - BlobShape outputShape = BlobShape(1, 1, 1, 7); - outputs[0].create(BlobShape(outputShape)); -} + void allocate(const std::vector &inputs, + std::vector &outputs) + { + CV_Assert(inputs.size() > 0); + CV_Assert(inputs[0]->size[0] == inputs[1]->size[0]); + _num = inputs[0]->size[0]; + + _numPriors = inputs[2]->size[2] / 4; + CV_Assert((_numPriors * _numLocClasses * 4) == inputs[0]->size[1]); + CV_Assert(int(_numPriors * _numClasses) == inputs[1]->size[1]); + + // num() and channels() are 1. + // Since the number of bboxes to be kept is unknown before nms, we manually + // set it to (fake) 1. + // Each row is a 7 dimension std::vector, which stores + // [image_id, label, confidence, xmin, ymin, xmax, ymax] + int outputShape[] = {1, 1, 1, 7}; + outputs[0].create(4, outputShape, CV_32F); + } -void DetectionOutputLayer::forward(std::vector &inputs, - std::vector &outputs) -{ - const float* locationData = inputs[0]->ptrf(); - const float* confidenceData = inputs[1]->ptrf(); - const float* priorData = inputs[2]->ptrf(); - - // Retrieve all location predictions. - std::vector allLocationPredictions; - GetLocPredictions(locationData, _num, _numPriors, _numLocClasses, - _shareLocation, &allLocationPredictions); - - // Retrieve all confidences. - std::vector > > allConfidenceScores; - GetConfidenceScores(confidenceData, _num, _numPriors, _numClasses, - &allConfidenceScores); - - // Retrieve all prior bboxes. It is same within a batch since we assume all - // images in a batch are of same dimension. - std::vector priorBBoxes; - std::vector > priorVariances; - GetPriorBBoxes(priorData, _numPriors, &priorBBoxes, &priorVariances); - - // Decode all loc predictions to bboxes. 
- std::vector allDecodedBBoxes; - DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, _num, - _shareLocation, _numLocClasses, _backgroundLabelId, - _codeType, _varianceEncodedInTarget, &allDecodedBBoxes); - - int numKept = 0; - std::vector > > allIndices; - for (int i = 0; i < _num; ++i) + void forward(std::vector &inputs, + std::vector &outputs) { - const LabelBBox& decodeBBoxes = allDecodedBBoxes[i]; - const std::map >& confidenceScores = - allConfidenceScores[i]; - std::map > indices; - int numDetections = 0; - for (int c = 0; c < (int)_numClasses; ++c) + const float* locationData = inputs[0]->ptr(); + const float* confidenceData = inputs[1]->ptr(); + const float* priorData = inputs[2]->ptr(); + + // Retrieve all location predictions. + std::vector allLocationPredictions; + GetLocPredictions(locationData, _num, _numPriors, _numLocClasses, + _shareLocation, &allLocationPredictions); + + // Retrieve all confidences. + std::vector > > allConfidenceScores; + GetConfidenceScores(confidenceData, _num, _numPriors, _numClasses, + &allConfidenceScores); + + // Retrieve all prior bboxes. It is same within a batch since we assume all + // images in a batch are of same dimension. + std::vector priorBBoxes; + std::vector > priorVariances; + GetPriorBBoxes(priorData, _numPriors, &priorBBoxes, &priorVariances); + + // Decode all loc predictions to bboxes. + std::vector allDecodedBBoxes; + DecodeBBoxesAll(allLocationPredictions, priorBBoxes, priorVariances, _num, + _shareLocation, _numLocClasses, _backgroundLabelId, + _codeType, _varianceEncodedInTarget, &allDecodedBBoxes); + + int numKept = 0; + std::vector > > allIndices; + for (int i = 0; i < _num; ++i) { - if (c == _backgroundLabelId) - { - // Ignore background class. - continue; - } - if (confidenceScores.find(c) == confidenceScores.end()) + const LabelBBox& decodeBBoxes = allDecodedBBoxes[i]; + const std::map >& confidenceScores = + allConfidenceScores[i]; + std::map > indices; + int numDetections = 0; + for (int c = 0; c < (int)_numClasses; ++c) { - // Something bad happened if there are no predictions for current label. - util::make_error("Could not find confidence predictions for label ", c); - } + if (c == _backgroundLabelId) + { + // Ignore background class. + continue; + } + if (confidenceScores.find(c) == confidenceScores.end()) + { + // Something bad happened if there are no predictions for current label. + util::make_error("Could not find confidence predictions for label ", c); + } - const std::vector& scores = confidenceScores.find(c)->second; - int label = _shareLocation ? -1 : c; - if (decodeBBoxes.find(label) == decodeBBoxes.end()) - { - // Something bad happened if there are no predictions for current label. - util::make_error("Could not find location predictions for label ", label); - continue; - } - const std::vector& bboxes = - decodeBBoxes.find(label)->second; - ApplyNMSFast(bboxes, scores, _confidenceThreshold, _nmsThreshold, - _topK, &(indices[c])); - numDetections += indices[c].size(); - } - if (_keepTopK > -1 && numDetections > _keepTopK) - { - std::vector > > scoreIndexPairs; - for (std::map >::iterator it = indices.begin(); - it != indices.end(); ++it) - { - int label = it->first; - const std::vector& labelIndices = it->second; - if (confidenceScores.find(label) == confidenceScores.end()) + const std::vector& scores = confidenceScores.find(c)->second; + int label = _shareLocation ? -1 : c; + if (decodeBBoxes.find(label) == decodeBBoxes.end()) { - // Something bad happened for current label. 
+ // Something bad happened if there are no predictions for current label. util::make_error("Could not find location predictions for label ", label); continue; } - const std::vector& scores = confidenceScores.find(label)->second; - for (size_t j = 0; j < labelIndices.size(); ++j) + const std::vector& bboxes = + decodeBBoxes.find(label)->second; + ApplyNMSFast(bboxes, scores, _confidenceThreshold, _nmsThreshold, + _topK, &(indices[c])); + numDetections += indices[c].size(); + } + if (_keepTopK > -1 && numDetections > _keepTopK) + { + std::vector > > scoreIndexPairs; + for (std::map >::iterator it = indices.begin(); + it != indices.end(); ++it) + { + int label = it->first; + const std::vector& labelIndices = it->second; + if (confidenceScores.find(label) == confidenceScores.end()) + { + // Something bad happened for current label. + util::make_error("Could not find location predictions for label ", label); + continue; + } + const std::vector& scores = confidenceScores.find(label)->second; + for (size_t j = 0; j < labelIndices.size(); ++j) + { + size_t idx = labelIndices[j]; + CV_Assert(idx < scores.size()); + scoreIndexPairs.push_back( + std::make_pair(scores[idx], std::make_pair(label, idx))); + } + } + // Keep outputs k results per image. + std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(), + util::SortScorePairDescend >); + scoreIndexPairs.resize(_keepTopK); + // Store the new indices. + std::map > newIndices; + for (size_t j = 0; j < scoreIndexPairs.size(); ++j) { - size_t idx = labelIndices[j]; - CV_Assert(idx < scores.size()); - scoreIndexPairs.push_back( - std::make_pair(scores[idx], std::make_pair(label, idx))); + int label = scoreIndexPairs[j].second.first; + int idx = scoreIndexPairs[j].second.second; + newIndices[label].push_back(idx); } + allIndices.push_back(newIndices); + numKept += _keepTopK; } - // Keep outputs k results per image. - std::sort(scoreIndexPairs.begin(), scoreIndexPairs.end(), - util::SortScorePairDescend >); - scoreIndexPairs.resize(_keepTopK); - // Store the new indices. - std::map > newIndices; - for (size_t j = 0; j < scoreIndexPairs.size(); ++j) + else { - int label = scoreIndexPairs[j].second.first; - int idx = scoreIndexPairs[j].second.second; - newIndices[label].push_back(idx); + allIndices.push_back(indices); + numKept += numDetections; } - allIndices.push_back(newIndices); - numKept += _keepTopK; } - else + + if (numKept == 0) { - allIndices.push_back(indices); - numKept += numDetections; + CV_ErrorNoReturn(Error::StsError, "Couldn't find any detections"); + return; } - } + int outputShape[] = {1, 1, numKept, 7}; + outputs[0].create(4, outputShape, CV_32F); + float* outputsData = outputs[0].ptr(); - if (numKept == 0) - { - CV_ErrorNoReturn(Error::StsError, "Couldn't find any detections"); - return; - } - std::vector outputsShape(2, 1); - outputsShape.push_back(numKept); - outputsShape.push_back(7); - outputs[0].create(outputsShape); - float* outputsData = outputs[0].ptrf(); - - int count = 0; - for (int i = 0; i < _num; ++i) - { - const std::map >& confidenceScores = - allConfidenceScores[i]; - const LabelBBox& decodeBBoxes = allDecodedBBoxes[i]; - for (std::map >::iterator it = allIndices[i].begin(); - it != allIndices[i].end(); ++it) + int count = 0; + for (int i = 0; i < _num; ++i) { - int label = it->first; - if (confidenceScores.find(label) == confidenceScores.end()) - { - // Something bad happened if there are no predictions for current label. 
- util::make_error("Could not find confidence predictions for label ", label); - continue; - } - const std::vector& scores = confidenceScores.find(label)->second; - int locLabel = _shareLocation ? -1 : label; - if (decodeBBoxes.find(locLabel) == decodeBBoxes.end()) + const std::map >& confidenceScores = + allConfidenceScores[i]; + const LabelBBox& decodeBBoxes = allDecodedBBoxes[i]; + for (std::map >::iterator it = allIndices[i].begin(); + it != allIndices[i].end(); ++it) { - // Something bad happened if there are no predictions for current label. - util::make_error("Could not find location predictions for label ", locLabel); - continue; - } - const std::vector& bboxes = + int label = it->first; + if (confidenceScores.find(label) == confidenceScores.end()) + { + // Something bad happened if there are no predictions for current label. + util::make_error("Could not find confidence predictions for label ", label); + continue; + } + const std::vector& scores = confidenceScores.find(label)->second; + int locLabel = _shareLocation ? -1 : label; + if (decodeBBoxes.find(locLabel) == decodeBBoxes.end()) + { + // Something bad happened if there are no predictions for current label. + util::make_error("Could not find location predictions for label ", locLabel); + continue; + } + const std::vector& bboxes = decodeBBoxes.find(locLabel)->second; - std::vector& indices = it->second; + std::vector& indices = it->second; - for (size_t j = 0; j < indices.size(); ++j) - { - int idx = indices[j]; - outputsData[count * 7] = i; - outputsData[count * 7 + 1] = label; - outputsData[count * 7 + 2] = scores[idx]; - caffe::NormalizedBBox clipBBox; - ClipBBox(bboxes[idx], &clipBBox); - outputsData[count * 7 + 3] = clipBBox.xmin(); - outputsData[count * 7 + 4] = clipBBox.ymin(); - outputsData[count * 7 + 5] = clipBBox.xmax(); - outputsData[count * 7 + 6] = clipBBox.ymax(); - - ++count; + for (size_t j = 0; j < indices.size(); ++j) + { + int idx = indices[j]; + outputsData[count * 7] = i; + outputsData[count * 7 + 1] = label; + outputsData[count * 7 + 2] = scores[idx]; + caffe::NormalizedBBox clipBBox; + ClipBBox(bboxes[idx], &clipBBox); + outputsData[count * 7 + 3] = clipBBox.xmin(); + outputsData[count * 7 + 4] = clipBBox.ymin(); + outputsData[count * 7 + 5] = clipBBox.xmax(); + outputsData[count * 7 + 6] = clipBBox.ymax(); + + ++count; + } } } } -} -float DetectionOutputLayer::BBoxSize(const caffe::NormalizedBBox& bbox, - const bool normalized) -{ - if (bbox.xmax() < bbox.xmin() || bbox.ymax() < bbox.ymin()) - { - // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. - return 0; - } - else + // Compute bbox size. + float BBoxSize(const caffe::NormalizedBBox& bbox, + const bool normalized=true) { - if (bbox.has_size()) + if (bbox.xmax() < bbox.xmin() || bbox.ymax() < bbox.ymin()) { - return bbox.size(); + // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0. + return 0; } else { - float width = bbox.xmax() - bbox.xmin(); - float height = bbox.ymax() - bbox.ymin(); - if (normalized) + if (bbox.has_size()) { - return width * height; + return bbox.size(); } else { - // If bbox is not within range [0, 1]. - return (width + 1) * (height + 1); + float width = bbox.xmax() - bbox.xmin(); + float height = bbox.ymax() - bbox.ymin(); + if (normalized) + { + return width * height; + } + else + { + // If bbox is not within range [0, 1]. 
+ return (width + 1) * (height + 1); + } } } } -} -void DetectionOutputLayer::ClipBBox(const caffe::NormalizedBBox& bbox, - caffe::NormalizedBBox* clipBBox) -{ - clipBBox->set_xmin(std::max(std::min(bbox.xmin(), 1.f), 0.f)); - clipBBox->set_ymin(std::max(std::min(bbox.ymin(), 1.f), 0.f)); - clipBBox->set_xmax(std::max(std::min(bbox.xmax(), 1.f), 0.f)); - clipBBox->set_ymax(std::max(std::min(bbox.ymax(), 1.f), 0.f)); - clipBBox->clear_size(); - clipBBox->set_size(BBoxSize(*clipBBox)); - clipBBox->set_difficult(bbox.difficult()); -} + // Clip the caffe::NormalizedBBox such that the range for each corner is [0, 1]. + void ClipBBox(const caffe::NormalizedBBox& bbox, + caffe::NormalizedBBox* clipBBox) + { + clipBBox->set_xmin(std::max(std::min(bbox.xmin(), 1.f), 0.f)); + clipBBox->set_ymin(std::max(std::min(bbox.ymin(), 1.f), 0.f)); + clipBBox->set_xmax(std::max(std::min(bbox.xmax(), 1.f), 0.f)); + clipBBox->set_ymax(std::max(std::min(bbox.ymax(), 1.f), 0.f)); + clipBBox->clear_size(); + clipBBox->set_size(BBoxSize(*clipBBox)); + clipBBox->set_difficult(bbox.difficult()); + } -void DetectionOutputLayer::DecodeBBox( - const caffe::NormalizedBBox& priorBBox, const std::vector& priorVariance, - const CodeType codeType, const bool varianceEncodedInTarget, - const caffe::NormalizedBBox& bbox, caffe::NormalizedBBox* decodeBBox) -{ - if (codeType == caffe::PriorBoxParameter_CodeType_CORNER) + // Decode a bbox according to a prior bbox. + void DecodeBBox(const caffe::NormalizedBBox& priorBBox, const std::vector& priorVariance, + const CodeType codeType, const bool varianceEncodedInTarget, + const caffe::NormalizedBBox& bbox, caffe::NormalizedBBox* decodeBBox) { - if (varianceEncodedInTarget) + if (codeType == caffe::PriorBoxParameter_CodeType_CORNER) { - // variance is encoded in target, we simply need to add the offset - // predictions. - decodeBBox->set_xmin(priorBBox.xmin() + bbox.xmin()); - decodeBBox->set_ymin(priorBBox.ymin() + bbox.ymin()); - decodeBBox->set_xmax(priorBBox.xmax() + bbox.xmax()); - decodeBBox->set_ymax(priorBBox.ymax() + bbox.ymax()); + if (varianceEncodedInTarget) + { + // variance is encoded in target, we simply need to add the offset + // predictions. + decodeBBox->set_xmin(priorBBox.xmin() + bbox.xmin()); + decodeBBox->set_ymin(priorBBox.ymin() + bbox.ymin()); + decodeBBox->set_xmax(priorBBox.xmax() + bbox.xmax()); + decodeBBox->set_ymax(priorBBox.ymax() + bbox.ymax()); + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. + decodeBBox->set_xmin( + priorBBox.xmin() + priorVariance[0] * bbox.xmin()); + decodeBBox->set_ymin( + priorBBox.ymin() + priorVariance[1] * bbox.ymin()); + decodeBBox->set_xmax( + priorBBox.xmax() + priorVariance[2] * bbox.xmax()); + decodeBBox->set_ymax( + priorBBox.ymax() + priorVariance[3] * bbox.ymax()); + } } - else + else if (codeType == caffe::PriorBoxParameter_CodeType_CENTER_SIZE) { - // variance is encoded in bbox, we need to scale the offset accordingly. 
- decodeBBox->set_xmin( - priorBBox.xmin() + priorVariance[0] * bbox.xmin()); - decodeBBox->set_ymin( - priorBBox.ymin() + priorVariance[1] * bbox.ymin()); - decodeBBox->set_xmax( - priorBBox.xmax() + priorVariance[2] * bbox.xmax()); - decodeBBox->set_ymax( - priorBBox.ymax() + priorVariance[3] * bbox.ymax()); - } - } - else - if (codeType == caffe::PriorBoxParameter_CodeType_CENTER_SIZE) - { - float priorWidth = priorBBox.xmax() - priorBBox.xmin(); - CV_Assert(priorWidth > 0); + float priorWidth = priorBBox.xmax() - priorBBox.xmin(); + CV_Assert(priorWidth > 0); - float priorHeight = priorBBox.ymax() - priorBBox.ymin(); - CV_Assert(priorHeight > 0); + float priorHeight = priorBBox.ymax() - priorBBox.ymin(); + CV_Assert(priorHeight > 0); - float priorCenterX = (priorBBox.xmin() + priorBBox.xmax()) / 2.; - float priorCenterY = (priorBBox.ymin() + priorBBox.ymax()) / 2.; + float priorCenterX = (priorBBox.xmin() + priorBBox.xmax()) / 2.; + float priorCenterY = (priorBBox.ymin() + priorBBox.ymax()) / 2.; - float decodeBBoxCenterX, decodeBBoxCenterY; - float decodeBBoxWidth, decodeBBoxHeight; - if (varianceEncodedInTarget) - { - // variance is encoded in target, we simply need to retore the offset - // predictions. - decodeBBoxCenterX = bbox.xmin() * priorWidth + priorCenterX; - decodeBBoxCenterY = bbox.ymin() * priorHeight + priorCenterY; - decodeBBoxWidth = exp(bbox.xmax()) * priorWidth; - decodeBBoxHeight = exp(bbox.ymax()) * priorHeight; - } - else - { - // variance is encoded in bbox, we need to scale the offset accordingly. - decodeBBoxCenterX = + float decodeBBoxCenterX, decodeBBoxCenterY; + float decodeBBoxWidth, decodeBBoxHeight; + if (varianceEncodedInTarget) + { + // variance is encoded in target, we simply need to restore the offset + // predictions. + decodeBBoxCenterX = bbox.xmin() * priorWidth + priorCenterX; + decodeBBoxCenterY = bbox.ymin() * priorHeight + priorCenterY; + decodeBBoxWidth = exp(bbox.xmax()) * priorWidth; + decodeBBoxHeight = exp(bbox.ymax()) * priorHeight; + } + else + { + // variance is encoded in bbox, we need to scale the offset accordingly. 
+ decodeBBoxCenterX = priorVariance[0] * bbox.xmin() * priorWidth + priorCenterX; - decodeBBoxCenterY = + decodeBBoxCenterY = priorVariance[1] * bbox.ymin() * priorHeight + priorCenterY; - decodeBBoxWidth = + decodeBBoxWidth = exp(priorVariance[2] * bbox.xmax()) * priorWidth; - decodeBBoxHeight = + decodeBBoxHeight = exp(priorVariance[3] * bbox.ymax()) * priorHeight; - } + } - decodeBBox->set_xmin(decodeBBoxCenterX - decodeBBoxWidth / 2.); - decodeBBox->set_ymin(decodeBBoxCenterY - decodeBBoxHeight / 2.); - decodeBBox->set_xmax(decodeBBoxCenterX + decodeBBoxWidth / 2.); - decodeBBox->set_ymax(decodeBBoxCenterY + decodeBBoxHeight / 2.); - } - else - { - CV_Error(Error::StsBadArg, "Unknown LocLossType."); + decodeBBox->set_xmin(decodeBBoxCenterX - decodeBBoxWidth / 2.); + decodeBBox->set_ymin(decodeBBoxCenterY - decodeBBoxHeight / 2.); + decodeBBox->set_xmax(decodeBBoxCenterX + decodeBBoxWidth / 2.); + decodeBBox->set_ymax(decodeBBoxCenterY + decodeBBoxHeight / 2.); + } + else + { + CV_Error(Error::StsBadArg, "Unknown LocLossType."); + } + float bboxSize = BBoxSize(*decodeBBox); + decodeBBox->set_size(bboxSize); } - float bboxSize = BBoxSize(*decodeBBox); - decodeBBox->set_size(bboxSize); -} -void DetectionOutputLayer::DecodeBBoxes( - const std::vector& priorBBoxes, - const std::vector >& priorVariances, - const CodeType codeType, const bool varianceEncodedInTarget, - const std::vector& bboxes, - std::vector* decodeBBoxes) -{ - CV_Assert(priorBBoxes.size() == priorVariances.size()); - CV_Assert(priorBBoxes.size() == bboxes.size()); - int numBBoxes = priorBBoxes.size(); - if (numBBoxes >= 1) + // Decode a set of bboxes according to a set of prior bboxes. + void DecodeBBoxes(const std::vector& priorBBoxes, + const std::vector >& priorVariances, + const CodeType codeType, const bool varianceEncodedInTarget, + const std::vector& bboxes, + std::vector* decodeBBoxes) { - CV_Assert(priorVariances[0].size() == 4); - } - decodeBBoxes->clear(); - for (int i = 0; i < numBBoxes; ++i) - { - caffe::NormalizedBBox decodeBBox; - DecodeBBox(priorBBoxes[i], priorVariances[i], codeType, - varianceEncodedInTarget, bboxes[i], &decodeBBox); - decodeBBoxes->push_back(decodeBBox); + CV_Assert(priorBBoxes.size() == priorVariances.size()); + CV_Assert(priorBBoxes.size() == bboxes.size()); + int numBBoxes = priorBBoxes.size(); + if (numBBoxes >= 1) + { + CV_Assert(priorVariances[0].size() == 4); + } + decodeBBoxes->clear(); + for (int i = 0; i < numBBoxes; ++i) + { + caffe::NormalizedBBox decodeBBox; + DecodeBBox(priorBBoxes[i], priorVariances[i], codeType, + varianceEncodedInTarget, bboxes[i], &decodeBBox); + decodeBBoxes->push_back(decodeBBox); + } } -} -void DetectionOutputLayer::DecodeBBoxesAll( - const std::vector& allLocPreds, - const std::vector& priorBBoxes, - const std::vector >& priorVariances, - const size_t num, const bool shareLocation, - const int numLocClasses, const int backgroundLabelId, - const CodeType codeType, const bool varianceEncodedInTarget, - std::vector* allDecodeBBoxes) -{ - CV_Assert(allLocPreds.size() == num); - allDecodeBBoxes->clear(); - allDecodeBBoxes->resize(num); - for (size_t i = 0; i < num; ++i) + // Decode all bboxes in a batch. 
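Before the batch-level driver below, a quick numeric check of the CENTER_SIZE branch above may help; every number here is invented for illustration, not taken from the patch:

    #include <cmath>
    #include <cstdio>

    // Hedged numeric sketch of the CENTER_SIZE decoding; all values assumed.
    int main()
    {
        // Prior box [0.2, 0.2, 0.6, 0.6]: center (0.4, 0.4), size 0.4 x 0.4.
        float priorW = 0.4f, priorH = 0.4f, priorCx = 0.4f, priorCy = 0.4f;
        // Encoded offsets (tx, ty, tw, th) and prior variances:
        float tx = 0.5f, ty = 0.0f, tw = 0.0f, th = 0.0f;
        float var[4] = { 0.1f, 0.1f, 0.2f, 0.2f };
        float cx = var[0] * tx * priorW + priorCx;  // 0.42
        float cy = var[1] * ty * priorH + priorCy;  // 0.40
        float w  = std::exp(var[2] * tw) * priorW;  // 0.40
        float h  = std::exp(var[3] * th) * priorH;  // 0.40
        // Decoded corners: [0.22, 0.20, 0.62, 0.60] -- the prior nudged right.
        std::printf("[%g, %g, %g, %g]\n", cx - w/2, cy - h/2, cx + w/2, cy + h/2);
        return 0;
    }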
+ void DecodeBBoxesAll(const std::vector& allLocPreds, + const std::vector& priorBBoxes, + const std::vector >& priorVariances, + const size_t num, const bool shareLocation, + const int numLocClasses, const int backgroundLabelId, + const CodeType codeType, const bool varianceEncodedInTarget, + std::vector* allDecodeBBoxes) { - // Decode predictions into bboxes. - LabelBBox& decodeBBoxes = (*allDecodeBBoxes)[i]; - for (int c = 0; c < numLocClasses; ++c) + CV_Assert(allLocPreds.size() == num); + allDecodeBBoxes->clear(); + allDecodeBBoxes->resize(num); + for (size_t i = 0; i < num; ++i) { - int label = shareLocation ? -1 : c; - if (label == backgroundLabelId) - { - // Ignore background class. - continue; - } - if (allLocPreds[i].find(label) == allLocPreds[i].end()) + // Decode predictions into bboxes. + LabelBBox& decodeBBoxes = (*allDecodeBBoxes)[i]; + for (int c = 0; c < numLocClasses; ++c) { - // Something bad happened if there are no predictions for current label. - util::make_error("Could not find location predictions for label ", label); - } - const std::vector& labelLocPreds = + int label = shareLocation ? -1 : c; + if (label == backgroundLabelId) + { + // Ignore background class. + continue; + } + if (allLocPreds[i].find(label) == allLocPreds[i].end()) + { + // Something bad happened if there are no predictions for current label. + util::make_error("Could not find location predictions for label ", label); + } + const std::vector& labelLocPreds = allLocPreds[i].find(label)->second; - DecodeBBoxes(priorBBoxes, priorVariances, - codeType, varianceEncodedInTarget, - labelLocPreds, &(decodeBBoxes[label])); + DecodeBBoxes(priorBBoxes, priorVariances, + codeType, varianceEncodedInTarget, + labelLocPreds, &(decodeBBoxes[label])); + } } } -} -void DetectionOutputLayer::GetPriorBBoxes(const float* priorData, const int& numPriors, - std::vector* priorBBoxes, - std::vector >* priorVariances) -{ - priorBBoxes->clear(); - priorVariances->clear(); - for (int i = 0; i < numPriors; ++i) + // Get prior bounding boxes from prior_data. + // prior_data: 1 x 2 x num_priors * 4 x 1 blob. + // num_priors: number of priors. + // prior_bboxes: stores all the prior bboxes in the format of caffe::NormalizedBBox. + // prior_variances: stores all the variances needed by prior bboxes. 
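The comment above describes the prior blob as two back-to-back planes: num_priors boxes of four corners each, then num_priors variance quadruples. A hedged sketch of that memory layout for numPriors == 2 (values invented), matching the indexing GetPriorBBoxes uses below:

    // Box i lives at priorData[i * 4 .. i * 4 + 3]; its variances at
    // priorData[(numPriors + i) * 4 .. (numPriors + i) * 4 + 3].
    const int numPriors = 2;
    const float priorData[2 * numPriors * 4] = {
        0.1f, 0.1f, 0.3f, 0.3f,    // prior 0: xmin, ymin, xmax, ymax
        0.5f, 0.5f, 0.9f, 0.9f,    // prior 1
        0.1f, 0.1f, 0.2f, 0.2f,    // variances of prior 0
        0.1f, 0.1f, 0.2f, 0.2f     // variances of prior 1
    };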
+ void GetPriorBBoxes(const float* priorData, const int& numPriors, + std::vector* priorBBoxes, + std::vector >* priorVariances) { - int startIdx = i * 4; - caffe::NormalizedBBox bbox; - bbox.set_xmin(priorData[startIdx]); - bbox.set_ymin(priorData[startIdx + 1]); - bbox.set_xmax(priorData[startIdx + 2]); - bbox.set_ymax(priorData[startIdx + 3]); - float bboxSize = BBoxSize(bbox); - bbox.set_size(bboxSize); - priorBBoxes->push_back(bbox); - } + priorBBoxes->clear(); + priorVariances->clear(); + for (int i = 0; i < numPriors; ++i) + { + int startIdx = i * 4; + caffe::NormalizedBBox bbox; + bbox.set_xmin(priorData[startIdx]); + bbox.set_ymin(priorData[startIdx + 1]); + bbox.set_xmax(priorData[startIdx + 2]); + bbox.set_ymax(priorData[startIdx + 3]); + float bboxSize = BBoxSize(bbox); + bbox.set_size(bboxSize); + priorBBoxes->push_back(bbox); + } - for (int i = 0; i < numPriors; ++i) - { - int startIdx = (numPriors + i) * 4; - std::vector var; - for (int j = 0; j < 4; ++j) + for (int i = 0; i < numPriors; ++i) { - var.push_back(priorData[startIdx + j]); + int startIdx = (numPriors + i) * 4; + std::vector var; + for (int j = 0; j < 4; ++j) + { + var.push_back(priorData[startIdx + j]); + } + priorVariances->push_back(var); } - priorVariances->push_back(var); } -} - -void DetectionOutputLayer::ScaleBBox(const caffe::NormalizedBBox& bbox, - const int height, const int width, - caffe::NormalizedBBox* scaleBBox) -{ - scaleBBox->set_xmin(bbox.xmin() * width); - scaleBBox->set_ymin(bbox.ymin() * height); - scaleBBox->set_xmax(bbox.xmax() * width); - scaleBBox->set_ymax(bbox.ymax() * height); - scaleBBox->clear_size(); - bool normalized = !(width > 1 || height > 1); - scaleBBox->set_size(BBoxSize(*scaleBBox, normalized)); - scaleBBox->set_difficult(bbox.difficult()); -} - -void DetectionOutputLayer::GetLocPredictions( - const float* locData, const int num, - const int numPredsPerClass, const int numLocClasses, - const bool shareLocation, std::vector* locPreds) -{ - locPreds->clear(); - if (shareLocation) + // Scale the caffe::NormalizedBBox w.r.t. height and width. + void ScaleBBox(const caffe::NormalizedBBox& bbox, + const int height, const int width, + caffe::NormalizedBBox* scaleBBox) { - CV_Assert(numLocClasses == 1); + scaleBBox->set_xmin(bbox.xmin() * width); + scaleBBox->set_ymin(bbox.ymin() * height); + scaleBBox->set_xmax(bbox.xmax() * width); + scaleBBox->set_ymax(bbox.ymax() * height); + scaleBBox->clear_size(); + bool normalized = !(width > 1 || height > 1); + scaleBBox->set_size(BBoxSize(*scaleBBox, normalized)); + scaleBBox->set_difficult(bbox.difficult()); } - locPreds->resize(num); - for (int i = 0; i < num; ++i) + + // Get location predictions from loc_data. + // loc_data: num x num_preds_per_class * num_loc_classes * 4 blob. + // num: the number of images. + // num_preds_per_class: number of predictions per class. + // num_loc_classes: number of location classes. It is 1 if share_location is + // true; and is equal to number of classes needed to predict otherwise. + // share_location: if true, all classes share the same location prediction. + // loc_preds: stores the location prediction, where each item contains + // location prediction for an image. 
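To make the loc_data layout described above concrete, a hedged indexing sketch; the SSD300-like prior count is an assumption, not taken from the patch:

    // With shareLocation == true there is one location class (map key -1).
    const int numPredsPerClass = 8732, numLocClasses = 1;   // assumed sizes
    int i = 1, p = 10, c = 0;                               // image, prior, class
    int offset = (i * numPredsPerClass * numLocClasses
                  + p * numLocClasses + c) * 4;
    // locData[offset + 0..3] hold the (xmin, ymin, xmax, ymax) regression for
    // prior p of image i; the per-image pointer advance in the function below
    // is exactly numPredsPerClass * numLocClasses * 4 floats.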
+    void GetLocPredictions(const float* locData, const int num,
+                           const int numPredsPerClass, const int numLocClasses,
+                           const bool shareLocation, std::vector<LabelBBox>* locPreds)
     {
-        LabelBBox& labelBBox = (*locPreds)[i];
-        for (int p = 0; p < numPredsPerClass; ++p)
+        locPreds->clear();
+        if (shareLocation)
         {
-            int startIdx = p * numLocClasses * 4;
-            for (int c = 0; c < numLocClasses; ++c)
+            CV_Assert(numLocClasses == 1);
+        }
+        locPreds->resize(num);
+        for (int i = 0; i < num; ++i)
+        {
+            LabelBBox& labelBBox = (*locPreds)[i];
+            for (int p = 0; p < numPredsPerClass; ++p)
             {
-                int label = shareLocation ? -1 : c;
-                if (labelBBox.find(label) == labelBBox.end())
+                int startIdx = p * numLocClasses * 4;
+                for (int c = 0; c < numLocClasses; ++c)
                 {
-                    labelBBox[label].resize(numPredsPerClass);
+                    int label = shareLocation ? -1 : c;
+                    if (labelBBox.find(label) == labelBBox.end())
+                    {
+                        labelBBox[label].resize(numPredsPerClass);
+                    }
+                    labelBBox[label][p].set_xmin(locData[startIdx + c * 4]);
+                    labelBBox[label][p].set_ymin(locData[startIdx + c * 4 + 1]);
+                    labelBBox[label][p].set_xmax(locData[startIdx + c * 4 + 2]);
+                    labelBBox[label][p].set_ymax(locData[startIdx + c * 4 + 3]);
                 }
-                labelBBox[label][p].set_xmin(locData[startIdx + c * 4]);
-                labelBBox[label][p].set_ymin(locData[startIdx + c * 4 + 1]);
-                labelBBox[label][p].set_xmax(locData[startIdx + c * 4 + 2]);
-                labelBBox[label][p].set_ymax(locData[startIdx + c * 4 + 3]);
             }
+            locData += numPredsPerClass * numLocClasses * 4;
         }
-        locData += numPredsPerClass * numLocClasses * 4;
     }
-}

-void DetectionOutputLayer::GetConfidenceScores(
-        const float* confData, const int num,
-        const int numPredsPerClass, const int numClasses,
-        std::vector<std::map<int, std::vector<float> > >* confPreds)
-{
-    confPreds->clear();
-    confPreds->resize(num);
-    for (int i = 0; i < num; ++i)
+    // Get confidence predictions from conf_data.
+    // conf_data: num x num_preds_per_class * num_classes blob.
+    // num: the number of images.
+    // num_preds_per_class: number of predictions per class.
+    // num_classes: number of classes.
+    // conf_preds: stores the confidence prediction, where each item contains
+    //   confidence prediction for an image.
+    void GetConfidenceScores(const float* confData, const int num,
+                             const int numPredsPerClass, const int numClasses,
+                             std::vector<std::map<int, std::vector<float> > >* confPreds)
     {
-        std::map<int, std::vector<float> >& labelScores = (*confPreds)[i];
-        for (int p = 0; p < numPredsPerClass; ++p)
+        confPreds->clear();
+        confPreds->resize(num);
+        for (int i = 0; i < num; ++i)
         {
-            int startIdx = p * numClasses;
-            for (int c = 0; c < numClasses; ++c)
+            std::map<int, std::vector<float> >& labelScores = (*confPreds)[i];
+            for (int p = 0; p < numPredsPerClass; ++p)
             {
-                labelScores[c].push_back(confData[startIdx + c]);
+                int startIdx = p * numClasses;
+                for (int c = 0; c < numClasses; ++c)
+                {
+                    labelScores[c].push_back(confData[startIdx + c]);
+                }
             }
+            confData += numPredsPerClass * numClasses;
         }
-        confData += numPredsPerClass * numClasses;
     }
-}
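    // Illustrative sketch: GetConfidenceScores() above effectively transposes
    // conf_data from prediction-major to class-major order,
    //
    //   labelScores[c][p] == confData[i * numPredsPerClass * numClasses + p * numClasses + c]
    //
    // so the per-class NMS stage can consume one contiguous score vector per class.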
-void DetectionOutputLayer::ApplyNMSFast(const std::vector<caffe::NormalizedBBox>& bboxes,
-                                        const std::vector<float>& scores,
-                                        const float score_threshold,
-                                        const float nms_threshold, const int top_k,
-                                        std::vector<int>* indices)
-{
-    // Sanity check.
-    CV_Assert(bboxes.size() == scores.size());
+    // Do non maximum suppression given bboxes and scores.
+    // Inspired by Piotr Dollar's NMS implementation in EdgeBox.
+    // https://goo.gl/jV3JYS
+    // bboxes: a set of bounding boxes.
+    // scores: a set of corresponding confidences.
+    // score_threshold: a threshold used to filter detection results.
+    // nms_threshold: a threshold used in non maximum suppression.
+    // top_k: if not -1, keep at most top_k picked indices.
+    // indices: the kept indices of bboxes after nms.
+    void ApplyNMSFast(const std::vector<caffe::NormalizedBBox>& bboxes,
+                      const std::vector<float>& scores,
+                      const float score_threshold,
+                      const float nms_threshold, const int top_k,
+                      std::vector<int>* indices)
+    {
+        // Sanity check.
+        CV_Assert(bboxes.size() == scores.size());

-    // Get top_k scores (with corresponding indices).
-    std::vector<std::pair<float, int> > score_index_vec;
-    GetMaxScoreIndex(scores, score_threshold, top_k, &score_index_vec);
+        // Get top_k scores (with corresponding indices).
+        std::vector<std::pair<float, int> > score_index_vec;
+        GetMaxScoreIndex(scores, score_threshold, top_k, &score_index_vec);

-    // Do nms.
-    indices->clear();
-    while (score_index_vec.size() != 0)
-    {
-        const int idx = score_index_vec.front().second;
-        bool keep = true;
-        for (size_t k = 0; k < indices->size(); ++k)
+        // Do nms.
+        indices->clear();
+        while (score_index_vec.size() != 0)
         {
-            if (keep)
+            const int idx = score_index_vec.front().second;
+            bool keep = true;
+            for (size_t k = 0; k < indices->size(); ++k)
             {
-                const int kept_idx = (*indices)[k];
-                float overlap = JaccardOverlap(bboxes[idx], bboxes[kept_idx]);
-                keep = overlap <= nms_threshold;
+                if (keep)
+                {
+                    const int kept_idx = (*indices)[k];
+                    float overlap = JaccardOverlap(bboxes[idx], bboxes[kept_idx]);
+                    keep = overlap <= nms_threshold;
+                }
+                else
+                {
+                    break;
+                }
             }
-            else
+            if (keep)
             {
-                break;
+                indices->push_back(idx);
             }
+            score_index_vec.erase(score_index_vec.begin());
         }
-        if (keep)
-        {
-            indices->push_back(idx);
-        }
-        score_index_vec.erase(score_index_vec.begin());
     }
-}
-
-void DetectionOutputLayer::GetMaxScoreIndex(
-        const std::vector<float>& scores, const float threshold,const int top_k,
-        std::vector<std::pair<float, int> >* score_index_vec)
-{
-    // Generate index score pairs.
-    for (size_t i = 0; i < scores.size(); ++i)
+    // Get max scores with corresponding indices.
+    // scores: a set of scores.
+    // threshold: only consider scores higher than the threshold.
+    // top_k: if -1, keep all; otherwise, keep at most top_k.
+    // score_index_vec: store the sorted (score, index) pair.
+    void GetMaxScoreIndex(const std::vector<float>& scores, const float threshold, const int top_k,
+                          std::vector<std::pair<float, int> >* score_index_vec)
     {
-        if (scores[i] > threshold)
+        // Generate index score pairs.
+        for (size_t i = 0; i < scores.size(); ++i)
         {
-            score_index_vec->push_back(std::make_pair(scores[i], i));
+            if (scores[i] > threshold)
+            {
+                score_index_vec->push_back(std::make_pair(scores[i], i));
+            }
         }
-    }
-    // Sort the score pair according to the scores in descending order
-    std::stable_sort(score_index_vec->begin(), score_index_vec->end(),
-                     util::SortScorePairDescend);
+        // Sort the score pairs according to the scores in descending order.
+        std::stable_sort(score_index_vec->begin(), score_index_vec->end(),
+                         util::SortScorePairDescend);

-    // Keep top_k scores if needed.
-    if (top_k > -1 && top_k < (int)score_index_vec->size())
-    {
-        score_index_vec->resize(top_k);
+        // Keep top_k scores if needed.
+        if (top_k > -1 && top_k < (int)score_index_vec->size())
+        {
+            score_index_vec->resize(top_k);
+        }
     }
-}
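    // Illustrative sketch: ApplyNMSFast() above is the classic greedy scheme --
    // visit boxes in descending score order and keep a box only if its overlap
    // with every already-kept box stays at or below nms_threshold. A hand-run
    // trace with nms_threshold = 0.5 and three boxes A, B, C sorted by score:
    //
    //   keep A                          // nothing kept yet
    //   IoU(B, A) = 0.7  > 0.5  -> drop B
    //   IoU(C, A) = 0.2 <= 0.5  -> keep C
    //
    // The erase-from-front loop is O(n^2) in the worst case, which is acceptable
    // here because GetMaxScoreIndex() already caps the candidate count at top_k.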
-    intersect_bbox->set_xmin(0);
-    intersect_bbox->set_ymin(0);
-    intersect_bbox->set_xmax(0);
-    intersect_bbox->set_ymax(0);
-  }
-  else
-  {
-    intersect_bbox->set_xmin(std::max(bbox1.xmin(), bbox2.xmin()));
-    intersect_bbox->set_ymin(std::max(bbox1.ymin(), bbox2.ymin()));
-    intersect_bbox->set_xmax(std::min(bbox1.xmax(), bbox2.xmax()));
-    intersect_bbox->set_ymax(std::min(bbox1.ymax(), bbox2.ymax()));
+    // Compute the intersection between two bboxes.
+    void IntersectBBox(const caffe::NormalizedBBox& bbox1,
+                       const caffe::NormalizedBBox& bbox2,
+                       caffe::NormalizedBBox* intersect_bbox) {
+        if (bbox2.xmin() > bbox1.xmax() || bbox2.xmax() < bbox1.xmin() ||
+            bbox2.ymin() > bbox1.ymax() || bbox2.ymax() < bbox1.ymin())
+        {
+            // Return [0, 0, 0, 0] if there is no intersection.
+            intersect_bbox->set_xmin(0);
+            intersect_bbox->set_ymin(0);
+            intersect_bbox->set_xmax(0);
+            intersect_bbox->set_ymax(0);
+        }
+        else
+        {
+            intersect_bbox->set_xmin(std::max(bbox1.xmin(), bbox2.xmin()));
+            intersect_bbox->set_ymin(std::max(bbox1.ymin(), bbox2.ymin()));
+            intersect_bbox->set_xmax(std::min(bbox1.xmax(), bbox2.xmax()));
+            intersect_bbox->set_ymax(std::min(bbox1.ymax(), bbox2.ymax()));
+        }
     }
-}
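    // Illustrative sketch: a worked example of the overlap computed by
    // JaccardOverlap() below, in the normalized case. For
    //
    //   bbox1 = [0.00, 0.00, 0.50, 0.50]   // area 0.25
    //   bbox2 = [0.25, 0.25, 0.75, 0.75]   // area 0.25
    //
    // IntersectBBox() yields [0.25, 0.25, 0.50, 0.50], i.e. 0.25 * 0.25 = 0.0625, so
    //
    //   IoU = 0.0625 / (0.25 + 0.25 - 0.0625) = 0.0625 / 0.4375 ~= 0.143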
-float DetectionOutputLayer::JaccardOverlap(const caffe::NormalizedBBox& bbox1,
-                                           const caffe::NormalizedBBox& bbox2,
-                                           const bool normalized) {
-    caffe::NormalizedBBox intersect_bbox;
-    IntersectBBox(bbox1, bbox2, &intersect_bbox);
-    float intersect_width, intersect_height;
-    if (normalized)
-    {
-        intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin();
-        intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin();
-    }
-    else
-    {
-        intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin() + 1;
-        intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin() + 1;
-    }
-    if (intersect_width > 0 && intersect_height > 0)
-    {
-        float intersect_size = intersect_width * intersect_height;
-        float bbox1_size = BBoxSize(bbox1);
-        float bbox2_size = BBoxSize(bbox2);
-        return intersect_size / (bbox1_size + bbox2_size - intersect_size);
-    }
-    else
+    // Compute the Jaccard (intersection over union, IoU) overlap between two bboxes.
+    float JaccardOverlap(const caffe::NormalizedBBox& bbox1,
+                         const caffe::NormalizedBBox& bbox2,
+                         const bool normalized = true)
     {
-        return 0.;
+        caffe::NormalizedBBox intersect_bbox;
+        IntersectBBox(bbox1, bbox2, &intersect_bbox);
+        float intersect_width, intersect_height;
+        if (normalized)
+        {
+            intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin();
+            intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin();
+        }
+        else
+        {
+            intersect_width = intersect_bbox.xmax() - intersect_bbox.xmin() + 1;
+            intersect_height = intersect_bbox.ymax() - intersect_bbox.ymin() + 1;
+        }
+        if (intersect_width > 0 && intersect_height > 0)
+        {
+            float intersect_size = intersect_width * intersect_height;
+            float bbox1_size = BBoxSize(bbox1);
+            float bbox2_size = BBoxSize(bbox2);
+            return intersect_size / (bbox1_size + bbox2_size - intersect_size);
+        }
+        else
+        {
+            return 0.;
+        }
     }
+};
+
+const std::string DetectionOutputLayerImpl::_layerName = std::string("DetectionOutput");
+
+Ptr<DetectionOutputLayer> DetectionOutputLayer::create(const LayerParams &params)
+{
+    return Ptr<DetectionOutputLayer>(new DetectionOutputLayerImpl(params));
+}

 }
 }
diff --git a/modules/dnn/src/layers/detection_output_layer.hpp b/modules/dnn/src/layers/detection_output_layer.hpp
deleted file mode 100644
index 0b28d69a8ff..00000000000
--- a/modules/dnn/src/layers/detection_output_layer.hpp
+++ /dev/null
@@ -1,226 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_DETECTION_OUTPUT_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_DETECTION_OUTPUT_LAYER_HPP__ - -#include "../precomp.hpp" -#include "caffe.pb.h" - -namespace cv -{ -namespace dnn -{ -class DetectionOutputLayer : public Layer -{ - unsigned _numClasses; - bool _shareLocation; - int _numLocClasses; - - int _backgroundLabelId; - - typedef caffe::PriorBoxParameter_CodeType CodeType; - CodeType _codeType; - - bool _varianceEncodedInTarget; - int _keepTopK; - float _confidenceThreshold; - - int _num; - int _numPriors; - - float _nmsThreshold; - int _topK; - - static const size_t _numAxes = 4; - static const std::string _layerName; - -public: - DetectionOutputLayer(LayerParams ¶ms); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); - - void checkInputs(const std::vector &inputs); - void getCodeType(LayerParams ¶ms); - - template - T getParameter(const LayerParams ¶ms, - const std::string ¶meterName, - const size_t &idx = 0, - const bool required = true, - const T& defaultValue = T()); - - bool getParameterDict(const LayerParams ¶ms, - const std::string ¶meterName, - DictValue& result); - - typedef std::map > LabelBBox; - - // Clip the caffe::NormalizedBBox such that the range for each corner is [0, 1]. - void ClipBBox(const caffe::NormalizedBBox& bbox, caffe::NormalizedBBox* clip_bbox); - - // Decode a bbox according to a prior bbox. - void DecodeBBox(const caffe::NormalizedBBox& prior_bbox, - const std::vector& prior_variance, const CodeType code_type, - const bool variance_encoded_in_target, const caffe::NormalizedBBox& bbox, - caffe::NormalizedBBox* decode_bbox); - - // Decode a set of bboxes according to a set of prior bboxes. - void DecodeBBoxes(const std::vector& prior_bboxes, - const std::vector >& prior_variances, - const CodeType code_type, const bool variance_encoded_in_target, - const std::vector& bboxes, - std::vector* decode_bboxes); - - // Decode all bboxes in a batch. - void DecodeBBoxesAll(const std::vector& all_loc_pred, - const std::vector& prior_bboxes, - const std::vector >& prior_variances, - const size_t num, const bool share_location, - const int num_loc_classes, const int background_label_id, - const CodeType code_type, const bool variance_encoded_in_target, - std::vector* all_decode_bboxes); - - // Get prior bounding boxes from prior_data. - // prior_data: 1 x 2 x num_priors * 4 x 1 blob. - // num_priors: number of priors. - // prior_bboxes: stores all the prior bboxes in the format of caffe::NormalizedBBox. - // prior_variances: stores all the variances needed by prior bboxes. - void GetPriorBBoxes(const float* priorData, const int& numPriors, - std::vector* priorBBoxes, - std::vector >* priorVariances); - - // Scale the caffe::NormalizedBBox w.r.t. height and width. - void ScaleBBox(const caffe::NormalizedBBox& bbox, const int height, const int width, - caffe::NormalizedBBox* scale_bbox); - - // Do non maximum suppression given bboxes and scores. - // Inspired by Piotr Dollar's NMS implementation in EdgeBox. - // https://goo.gl/jV3JYS - // bboxes: a set of bounding boxes. - // scores: a set of corresponding confidences. - // score_threshold: a threshold used to filter detection results. - // nms_threshold: a threshold used in non maximum suppression. - // top_k: if not -1, keep at most top_k picked indices. - // indices: the kept indices of bboxes after nms. 
- void ApplyNMSFast(const std::vector& bboxes, - const std::vector& scores, const float score_threshold, - const float nms_threshold, const int top_k, std::vector* indices); - - - // Do non maximum suppression given bboxes and scores. - // bboxes: a set of bounding boxes. - // scores: a set of corresponding confidences. - // threshold: the threshold used in non maximu suppression. - // top_k: if not -1, keep at most top_k picked indices. - // reuse_overlaps: if true, use and update overlaps; otherwise, always - // compute overlap. - // overlaps: a temp place to optionally store the overlaps between pairs of - // bboxes if reuse_overlaps is true. - // indices: the kept indices of bboxes after nms. - void ApplyNMS(const std::vector& bboxes, - const std::vector& scores, - const float threshold, const int top_k, const bool reuse_overlaps, - std::map >* overlaps, std::vector* indices); - - void ApplyNMS(const bool* overlapped, const int num, std::vector* indices); - - // Get confidence predictions from conf_data. - // conf_data: num x num_preds_per_class * num_classes blob. - // num: the number of images. - // num_preds_per_class: number of predictions per class. - // num_classes: number of classes. - // conf_preds: stores the confidence prediction, where each item contains - // confidence prediction for an image. - void GetConfidenceScores(const float* conf_data, const int num, - const int num_preds_per_class, const int num_classes, - std::vector > >* conf_scores); - - // Get confidence predictions from conf_data. - // conf_data: num x num_preds_per_class * num_classes blob. - // num: the number of images. - // num_preds_per_class: number of predictions per class. - // num_classes: number of classes. - // class_major: if true, data layout is - // num x num_classes x num_preds_per_class; otherwise, data layerout is - // num x num_preds_per_class * num_classes. - // conf_preds: stores the confidence prediction, where each item contains - // confidence prediction for an image. - void GetConfidenceScores(const float* conf_data, const int num, - const int num_preds_per_class, const int num_classes, - const bool class_major, - std::vector > >* conf_scores); - - // Get location predictions from loc_data. - // loc_data: num x num_preds_per_class * num_loc_classes * 4 blob. - // num: the number of images. - // num_preds_per_class: number of predictions per class. - // num_loc_classes: number of location classes. It is 1 if share_location is - // true; and is equal to number of classes needed to predict otherwise. - // share_location: if true, all classes share the same location prediction. - // loc_preds: stores the location prediction, where each item contains - // location prediction for an image. - void GetLocPredictions(const float* loc_data, const int num, - const int num_preds_per_class, const int num_loc_classes, - const bool share_location, std::vector* loc_preds); - - // Get max scores with corresponding indices. - // scores: a set of scores. - // threshold: only consider scores higher than the threshold. - // top_k: if -1, keep all; otherwise, keep at most top_k. - // score_index_vec: store the sorted (score, index) pair. - void GetMaxScoreIndex(const std::vector& scores, const float threshold, - const int top_k, std::vector >* score_index_vec); - - // Compute the jaccard (intersection over union IoU) overlap between two bboxes. 
- float JaccardOverlap(const caffe::NormalizedBBox& bbox1, const caffe::NormalizedBBox& bbox2, - const bool normalized = true); - - // Compute the intersection between two bboxes. - void IntersectBBox(const caffe::NormalizedBBox& bbox1, const caffe::NormalizedBBox& bbox2, - caffe::NormalizedBBox* intersect_bbox); - - // Compute bbox size. - float BBoxSize(const caffe::NormalizedBBox& bbox, const bool normalized = true); -}; -} -} -#endif diff --git a/modules/dnn/src/layers/elementwise_layers.cpp b/modules/dnn/src/layers/elementwise_layers.cpp index 914e9c2fe84..74e5ab4ac71 100644 --- a/modules/dnn/src/layers/elementwise_layers.cpp +++ b/modules/dnn/src/layers/elementwise_layers.cpp @@ -1,92 +1,254 @@ #include "../precomp.hpp" -#include "elementwise_layers.hpp" #include "opencv2/imgproc.hpp" +#include namespace cv { namespace dnn { -#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) \ -Ptr<_Layer> _Layer::create() { \ - return return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); } +using std::abs; +using std::exp; +using std::tanh; +using std::pow; - -Ptr ReLULayer::create(double negativeSlope) +template +class ElementWiseLayer : public Func::Layer { - Ptr layer(new ElementWiseLayer(ReLUFunctor(negativeSlope))); - layer->negativeSlope = negativeSlope; - return layer; -} + Func func; + + template + class PBody : public cv::ParallelLoopBody + { + Func &func; + Dtype *data; + public: + + PBody(Mat &mat, Func &func_) : + func(func_), data(mat.ptr()) + {} + + void operator()(const Range &r) const + { + for (int i = r.start; i < r.end; i++) + data[i] = func(data[i]); + } + }; + +public: + + ElementWiseLayer(const Func &f=Func()) : func(f) {} + + void allocate(const std::vector &inputs, std::vector &outputs) + { + outputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + { + outputs[i] = *inputs[i]; + } + } + + void forward(std::vector &inputs, std::vector &outputs) + { + for (size_t i = 0; i < inputs.size(); i++) + { + const Mat &src = *inputs[i]; + Mat &dst = outputs[i]; + CV_Assert(src.ptr() == dst.ptr() && src.isContinuous()); + + Range sizeRange = Range(0, dst.total()); + CV_Assert(src.type() == CV_32F); + cv::parallel_for_(sizeRange, PBody(dst, func)); + } + } +}; -Ptr TanHLayer::create() +struct ReLUFunctor { - return Ptr(new ElementWiseLayer()); -} + typedef ReLULayer Layer; + float slope; -Ptr SigmoidLayer::create() + ReLUFunctor(float slope_) : slope(slope_) {} + + template + inline TFloat operator()(TFloat x) const + { + return (x >= (TFloat)0) ? 
x : (TFloat)slope * x; + } +}; + +struct TanHFunctor { - return Ptr(new ElementWiseLayer()); -} + typedef TanHLayer Layer; + + template + inline TFloat operator()(TFloat x) const + { + return tanh(x); + } +}; -Ptr AbsLayer::create() +struct SigmoidFunctor { - return Ptr(new ElementWiseLayer()); -} + typedef SigmoidLayer Layer; + + template + inline TFloat operator()(TFloat x) const + { + return (TFloat)1 / ((TFloat)1 + exp(-x)); + } +}; -Ptr BNLLLayer::create() +struct AbsValFunctor { - return Ptr(new ElementWiseLayer()); -} + typedef AbsLayer Layer; -Ptr PowerLayer::create(double power /*= 1*/, double scale /*= 1*/, double shift /*= 0*/) + template + inline TFloat operator()(TFloat x) const + { + return abs(x); + } +}; + +struct BNLLFunctor { - const PowerFunctor f(power, scale, shift); - Ptr layer(new ElementWiseLayer(f)); - layer->power = power; - layer->scale = scale; - layer->shift = shift; - return layer; -} + typedef BNLLLayer Layer; -//////////////////////////////////////////////////////////////////////////// + template + inline TFloat operator()(TFloat x) const + { + return log((TFloat)1 + exp(-abs(x))); + } +}; -void ChannelsPReLULayerImpl::allocate(const std::vector &inputs, std::vector &outputs) +struct PowerFunctor { - CV_Assert(blobs.size() == 1); + typedef PowerLayer Layer; - outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) + const float power; + const float scale; + const float shift; + + PowerFunctor(float power_, float scale_ = 1.f, float shift_ = 0) + : power(power_), scale(scale_), shift(shift_) {} + + template + inline TFloat operator()(TFloat x) const { - outputs[i].create(inputs[i]->shape()); + return power == 1.0f ? (TFloat)shift + (TFloat)scale * x : + pow((TFloat)shift + (TFloat)scale * x, (TFloat)power); } -} +}; -void ChannelsPReLULayerImpl::forward(std::vector &inputs, std::vector &outputs) +class ChannelsPReLULayerImpl : public ChannelsPReLULayer { - CV_Assert(inputs.size() == 1); +public: + ChannelsPReLULayerImpl(const LayerParams& params) + { + CV_Assert(params.blobs.size() == 1); + setParamsFrom(params); + } + + //////////////////////////////////////////////////////////////////////////// - Blob &inpBlob = *inputs[0]; + void allocate(const std::vector &inputs, std::vector &outputs) + { + CV_Assert(blobs.size() == 1); + + outputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + { + outputs[i].create(inputs[i]->dims, inputs[i]->size.p, inputs[i]->type()); + } + } - for (size_t ii = 0; ii < outputs.size(); ii++) + void forward(std::vector &inputs, std::vector &outputs) { - Blob &outBlob = outputs[ii]; + CV_Assert(inputs.size() == 1); - CV_Assert(blobs[0].total() == inpBlob.channels()); + Mat &inpBlob = *inputs[0]; - for (int n = 0; n < inpBlob.channels(); n++) + for (size_t ii = 0; ii < outputs.size(); ii++) { - float slopeWeight = blobs[0].matRefConst().at(n); + Mat &outBlob = outputs[ii]; - cv::threshold(inpBlob.getPlane(0, n), outBlob.getPlane(0, n), 0, 0, cv::THRESH_TOZERO_INV); - outBlob.getPlane(0, n) = inpBlob.getPlane(0, n) + (slopeWeight - 1)*outBlob.getPlane(0, n); + CV_Assert(blobs[0].total() == inpBlob.size[1]); + + for (int n = 0; n < inpBlob.size[1]; n++) + { + float slopeWeight = blobs[0].at(n); + + Mat inpBlobPlane = getPlane(inpBlob, 0, n); + Mat outBlobPlane = getPlane(outBlob, 0, n); + + threshold(inpBlobPlane, outBlobPlane, 0, 0, cv::THRESH_TOZERO_INV); + scaleAdd(outBlobPlane, slopeWeight-1, inpBlobPlane, outBlobPlane); + } } } +}; + +#define ACTIVATION_CREATOR_FOR(_Layer, _Functor, ...) 
\
+Ptr<_Layer> _Layer::create() { \
+    return Ptr<_Layer>( new ElementWiseLayer<_Functor>(_Functor()) ); }


+Ptr<ReLULayer> ReLULayer::create(const LayerParams& params)
+{
+    float negativeSlope = params.get<float>("negative_slope", 0.f);
+    Ptr<ReLULayer> l(new ElementWiseLayer<ReLUFunctor>(ReLUFunctor(negativeSlope)));
+    l->setParamsFrom(params);
+
+    return l;
 }

-Ptr<ChannelsPReLULayer> ChannelsPReLULayer::create()
+Ptr<TanHLayer> TanHLayer::create(const LayerParams& params)
+{
+    Ptr<TanHLayer> l(new ElementWiseLayer<TanHFunctor>());
+    l->setParamsFrom(params);
+
+    return l;
+}
+
+Ptr<SigmoidLayer> SigmoidLayer::create(const LayerParams& params)
+{
+    Ptr<SigmoidLayer> l(new ElementWiseLayer<SigmoidFunctor>());
+    l->setParamsFrom(params);
+
+    return l;
+}
+
+Ptr<AbsLayer> AbsLayer::create(const LayerParams& params)
+{
+    Ptr<AbsLayer> l(new ElementWiseLayer<AbsValFunctor>());
+    l->setParamsFrom(params);
+
+    return l;
+}
+
+Ptr<BNLLLayer> BNLLLayer::create(const LayerParams& params)
+{
+    Ptr<BNLLLayer> l(new ElementWiseLayer<BNLLFunctor>());
+    l->setParamsFrom(params);
+
+    return l;
+}
+
+Ptr<PowerLayer> PowerLayer::create(const LayerParams& params)
+{
+    float power = params.get<float>("power", 1.0f);
+    float scale = params.get<float>("scale", 1.0f);
+    float shift = params.get<float>("shift", 0.0f);
+    Ptr<PowerLayer> l(new ElementWiseLayer<PowerFunctor>(PowerFunctor(power, scale, shift)));
+    l->setParamsFrom(params);
+
+    return l;
+}
+
+
+Ptr<ChannelsPReLULayer> ChannelsPReLULayer::create(const LayerParams& params)
 {
-    return Ptr<ChannelsPReLULayer>(new ChannelsPReLULayerImpl());
+    return Ptr<ChannelsPReLULayer>(new ChannelsPReLULayerImpl(params));
 }
 }
diff --git a/modules/dnn/src/layers/elementwise_layers.hpp b/modules/dnn/src/layers/elementwise_layers.hpp
deleted file mode 100644
index 0c01812bf36..00000000000
--- a/modules/dnn/src/layers/elementwise_layers.hpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_ELEMENTWISE_LAYERS_HPP__ -#define __OPENCV_DNN_LAYERS_ELEMENTWISE_LAYERS_HPP__ -#include "../precomp.hpp" -#include "layers_common.hpp" -#include -#include -#include -#include "opencl_kernels_dnn.hpp" - -namespace cv -{ -namespace dnn -{ - -using std::abs; -using std::exp; -using std::tanh; -using std::pow; - -template -class ElementWiseLayer : public Func::Layer -{ - bool useOpenCL; - Func func; - - template - class PBody : public cv::ParallelLoopBody - { - Func &func; - Dtype *data; - public: - - PBody(Mat &mat, Func &func_) : - func(func_), data(mat.ptr()) - {} - - void operator()(const Range &r) const - { - for (int i = r.start; i < r.end; i++) - data[i] = func(data[i]); - } - }; - -public: - - ElementWiseLayer() {} - ElementWiseLayer(const Func &f) : func(f) {} - - void allocate(const std::vector &inputs, std::vector &outputs) - { - useOpenCL = ocl::useOpenCL(); - - outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) - { - outputs[i].shareFrom(*inputs[i]); //no data copy - - //hotfix: shareFrom doesn't provide properly Mat/UMat switching - if (useOpenCL) - outputs[i].umatRef() = inputs[i]->umatRefConst(); - else - outputs[i].matRef() = inputs[i]->matRefConst(); - } - } - - void forward(std::vector &inputs, std::vector &outputs) - { - #ifdef HAVE_OPENCL - if (useOpenCL) - forwardOCL(inputs, outputs); - else - #endif - forwardCPU(inputs, outputs); - } - - #ifdef HAVE_OPENCL - void forwardOCL(std::vector &inputs, std::vector &outputs) - { - size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); - - for (size_t i = 0; i < inputs.size(); i++) - { - const UMat &src = inputs[i]->umatRefConst(); - UMat &dst = outputs[i].umatRef(); - CV_Assert(src.isContinuous() && dst.isContinuous() && !src.offset && !dst.offset); - - ocl::Kernel ker; - CV_Assert(func.initKernel(ker, src)); - ker.set(0, (int)src.total()); - ker.set(1, ocl::KernelArg::PtrReadOnly(src)); - ker.set(2, ocl::KernelArg::PtrWriteOnly(dst)); - - size_t gSize = src.total(); - CV_Assert(ker.run(1, &gSize, &wgSize, true)); - } - } - #endif - - void forwardCPU(std::vector &inputs, std::vector &outputs) - { - for (size_t i = 0; i < inputs.size(); i++) - { - const Mat &src = inputs[i]->matRefConst(); - Mat &dst = outputs[i].matRef(); - CV_Assert(src.ptr() == dst.ptr() && src.isContinuous()); - - Range sizeRange = Range(0, dst.total()); - if (dst.type() == CV_32F) - { - cv::parallel_for_(sizeRange, PBody(dst, func)); - } - else if (dst.type() == CV_64F) - { - cv::parallel_for_(sizeRange, PBody(dst, func)); - } - else - { - CV_Error(Error::StsNotImplemented, "Only CV_32F and CV_64F blobs are supported"); - } - } - } -}; - -#ifdef HAVE_OPENCL -static String oclGetTMacro(const UMat &m) -{ - return String("-DT=") + ocl::typeToStr(m.type()) + String(" "); -} -#endif - -struct ReLUFunctor -{ - typedef ReLULayer Layer; - - double slope; - - ReLUFunctor(double slope_) - : slope(slope_) {} - - template - inline TFloat operator()(TFloat x) const 
- { - return (x >= (TFloat)0) ? x : (TFloat)slope * x; - } - - #ifdef HAVE_OPENCL - bool initKernel(ocl::Kernel &ker, const UMat &src) const - { - const char *buildoptSlope = (slope == 0) ? "-DRELU_NO_SLOPE" : ""; - String buildopt = oclGetTMacro(src) + buildoptSlope; - - if (!ker.create("ReLUForward", ocl::dnn::activations_oclsrc, buildopt)) - return false; - - if (slope != 0) - ker.set(3, (float)slope); - - return true; - } - #endif -}; - -struct TanHFunctor -{ - typedef TanHLayer Layer; - - template - inline TFloat operator()(TFloat x) const - { - return tanh(x); - } - - #ifdef HAVE_OPENCL - bool initKernel(ocl::Kernel &ker, const UMat &src) const - { - if (!ker.create("TanHForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src))) - return false; - return true; - } - #endif -}; - -struct SigmoidFunctor -{ - typedef SigmoidLayer Layer; - - template - inline TFloat operator()(TFloat x) const - { - return (TFloat)1 / ((TFloat)1 + exp(-x)); - } - - #ifdef HAVE_OPENCL - bool initKernel(ocl::Kernel &ker, const UMat &src) const - { - if (!ker.create("SigmoidForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src))) - return false; - return true; - } - #endif -}; - -struct AbsValFunctor -{ - typedef AbsLayer Layer; - - template - inline TFloat operator()(TFloat x) const - { - return abs(x); - } - - #ifdef HAVE_OPENCL - bool initKernel(ocl::Kernel &ker, const UMat &src) const - { - if (!ker.create("AbsValForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src))) - return false; - return true; - } - #endif -}; - -struct BNLLFunctor -{ - typedef BNLLLayer Layer; - - template - inline TFloat operator()(TFloat x) const - { - return log((TFloat)1 + exp(-abs(x))); - } - - #ifdef HAVE_OPENCL - bool initKernel(ocl::Kernel &ker, const UMat &src) const - { - if (!ker.create("BNLLForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src))) - return false; - return true; - } - #endif -}; - -struct PowerFunctor -{ - typedef PowerLayer Layer; - - const double power; - const double scale; - const double shift; - - PowerFunctor(double power_, double scale_ = 1, double shift_ = 0) - : power(power_), scale(scale_), shift(shift_) {} - - template - inline TFloat operator()(TFloat x) const - { - return power == 1.0 ? 
(TFloat)shift + (TFloat)scale * x : pow((TFloat)shift + (TFloat)scale * x, (TFloat)power);
-    }
-
-    #ifdef HAVE_OPENCL
-    bool initKernel(ocl::Kernel &ker, const UMat &src) const
-    {
-        if (!ker.create("PowForward", ocl::dnn::activations_oclsrc, oclGetTMacro(src)))
-            return false;
-
-        ker.set(3, (float)power);
-        ker.set(4, (float)scale);
-        ker.set(5, (float)shift);
-
-        return true;
-    }
-    #endif
-};
-
-class ChannelsPReLULayerImpl : public ChannelsPReLULayer
-{
-public:
-    ChannelsPReLULayerImpl() {}
-
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-};
-
-}
-}
-#endif
diff --git a/modules/dnn/src/layers/eltwise_layer.cpp b/modules/dnn/src/layers/eltwise_layer.cpp
index 3568ccdd4ad..2e88bbbe500 100755
--- a/modules/dnn/src/layers/eltwise_layer.cpp
+++ b/modules/dnn/src/layers/eltwise_layer.cpp
@@ -41,88 +41,117 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
-#include "eltwise_layer.hpp"

 namespace cv
 {
 namespace dnn
 {
-    EltwiseLayerImpl::EltwiseLayerImpl(EltwiseOp op_, const std::vector<int> &coeffs_)
+
+class EltwiseLayerImpl : public EltwiseLayer
+{
+public:
+    EltwiseOp op;
+    std::vector<float> coeffs;
+
+    EltwiseLayerImpl(const LayerParams& params)
     {
-        op = op_;
-        coeffs = coeffs_;
+        setParamsFrom(params);
+        op = EltwiseLayer::SUM;
+        if (params.has("operation"))
+        {
+            String operation = params.get<String>("operation").toLowerCase();
+            if (operation == "prod")
+                op = EltwiseLayer::PROD;
+            else if (operation == "sum")
+                op = EltwiseLayer::SUM;
+            else if (operation == "max")
+                op = EltwiseLayer::MAX;
+            else
+                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
+        }
+
+        if (params.has("coeff"))
+        {
+            DictValue paramCoeff = params.get("coeff");
+            int i, n = paramCoeff.size();
+            coeffs.resize(n);
+            for (i = 0; i < n; i++)
+            {
+                coeffs[i] = paramCoeff.get<float>(i);
+            }
+        }
     }

-    void EltwiseLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
         CV_Assert(2 <= inputs.size());
         CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
         CV_Assert(op == SUM || coeffs.size() == 0);

-        const BlobShape &shape0 = inputs[0]->shape();
         for (size_t i = 1; i < inputs.size(); ++i)
         {
-            BlobShape iShape = inputs[i]->shape();
-            CV_Assert(shape0 == iShape);
+            CV_Assert(inputs[i]->size == inputs[0]->size);
         }

         outputs.resize(1);
-        outputs[0].create(shape0);
+        outputs[0].create(inputs[0]->dims, inputs[0]->size.p, inputs[0]->type());
     }

-    void EltwiseLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
         switch (op)
         {
-        case SUM:
+            case SUM:
             {
                 CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
-                Mat& output = outputs[0].matRef();
+                Mat& output = outputs[0];
                 output.setTo(0.);
                 if (0 < coeffs.size())
                 {
                     for (size_t i = 0; i < inputs.size(); i++)
                     {
-                        output += inputs[i]->matRefConst() * coeffs[i];
+                        output += *inputs[i] * coeffs[i];
                     }
                 }
                 else
                 {
                     for (size_t i = 0; i < inputs.size(); i++)
                     {
-                        output += inputs[i]->matRefConst();
+                        output += *inputs[i];
                     }
                 }
             }
-        break;
-        case PROD:
+            break;
+            case PROD:
             {
-                Mat& output = outputs[0].matRef();
+                Mat& output = outputs[0];
                 output.setTo(1.);
                 for (size_t i = 0; i < inputs.size(); i++)
                 {
-                    output = output.mul(inputs[i]->matRefConst());
+                    output = output.mul(*inputs[i]);
                 }
             }
-        break;
-        case MAX:
+            break;
+            case MAX:
             {
-                Mat& output = outputs[0].matRef();
-                cv::max(inputs[0]->matRefConst(), inputs[1]->matRefConst(), output);
+                Mat& output = outputs[0];
+                cv::max(*inputs[0], *inputs[1], output);
                 for (size_t i = 2; i < inputs.size(); i++)
                 {
-                    cv::max(output, inputs[i]->matRefConst(), output);
+                    cv::max(output, *inputs[i], output);
                 }
             }
-        break;
-        default:
-            CV_Assert(0);
-            break;
-        };
+            break;
+            default:
+                CV_Assert(0);
+                break;
+        }
     }
+};
+
+Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
+{
+    return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
+}

-    Ptr<EltwiseLayer> EltwiseLayer::create(EltwiseOp op, const std::vector<int> &coeffs)
-    {
-        return Ptr<EltwiseLayer>(new EltwiseLayerImpl(op, coeffs));
-    }
 }
 }
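// Illustrative sketch: with the LayerParams-based factory above, an eltwise layer
// is configured through dictionary entries rather than constructor arguments. A
// minimal usage sketch for a hypothetical two-input setup (DictValue::arrayReal is
// an assumption based on the dnn Dict API of the same era):
//
//   double coeffVals[] = {0.5, 0.5};          // optional per-input weights (SUM only)
//   LayerParams lp;
//   lp.set("operation", "sum");               // "sum", "prod" or "max"
//   lp.set("coeff", DictValue::arrayReal<double*>(coeffVals, 2));
//   Ptr<EltwiseLayer> layer = EltwiseLayer::create(lp);
//
// Omitting "operation" falls back to SUM, and omitting "coeff" gives an unweighted sum.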
diff --git a/modules/dnn/src/layers/eltwise_layer.hpp b/modules/dnn/src/layers/eltwise_layer.hpp
deleted file mode 100755
index c67575cde2a..00000000000
--- a/modules/dnn/src/layers/eltwise_layer.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-//  By downloading, copying, installing or using the software you agree to this license.
-//  If you do not agree to this license, do not download, install,
-//  copy or use the software.
-//
-//
-//                           License Agreement
-//                For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-//   * Redistribution's of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//
-//   * Redistribution's in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//
-//   * The name of the copyright holders may not be used to endorse or promote products
-//     derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_ELTWISE_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_ELTWISE_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - class EltwiseLayerImpl : public EltwiseLayer - { - EltwiseOp op; - std::vector coeffs; - public: - EltwiseLayerImpl(EltwiseOp op, const std::vector &coeffs); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); - }; -} -} -#endif diff --git a/modules/dnn/src/layers/flatten_layer.cpp b/modules/dnn/src/layers/flatten_layer.cpp index dc8070393ff..05700f90c05 100644 --- a/modules/dnn/src/layers/flatten_layer.cpp +++ b/modules/dnn/src/layers/flatten_layer.cpp @@ -41,7 +41,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "flatten_layer.hpp" #include #include @@ -50,68 +49,72 @@ namespace cv namespace dnn { -FlattenLayer::FlattenLayer(LayerParams ¶ms) : Layer(params) +class FlattenLayerImpl : public FlattenLayer { - _startAxis = params.get("axis", 1); - _endAxis = params.get("end_axis", -1); -} +public: + FlattenLayerImpl(const LayerParams ¶ms) + { + _startAxis = params.get("axis", 1); + _endAxis = params.get("end_axis", -1); + setParamsFrom(params); + } -void FlattenLayer::checkInputs(const std::vector &inputs) -{ - CV_Assert(inputs.size() > 0); - for (size_t i = 1; i < inputs.size(); i++) + void allocate(const std::vector &inputs, std::vector &outputs) { - for (size_t j = 0; j < _numAxes; j++) + size_t i, ninputs = inputs.size(); + CV_Assert(ninputs > 0); + const Mat& inp0 = *inputs[0]; + + for (i = 1; i < ninputs; i++) { - CV_Assert(inputs[i]->shape()[j] == inputs[0]->shape()[j]); + CV_Assert(inputs[i]->size == inp0.size); } - } -} -void FlattenLayer::allocate(const std::vector &inputs, std::vector &outputs) -{ - checkInputs(inputs); + _numAxes = inp0.dims; + _endAxis = _endAxis < 0 ? 
_endAxis + _numAxes : _endAxis;
+        CV_Assert(_startAxis >= 0);
+        CV_Assert(_endAxis >= _startAxis && _endAxis < (int)_numAxes);
-    _numAxes = inputs[0]->dims();
-    _endAxis = inputs[0]->canonicalAxis(_endAxis);
-    CV_Assert(_startAxis >= 0);
-    CV_Assert(_endAxis >= _startAxis && _endAxis < (int)_numAxes);
+        size_t flattenedDimensionSize = inp0.total(_startAxis, _endAxis+1);

-    size_t flattenedDimensionSize = 1;
-    for (int i = _startAxis; i <= _endAxis; i++)
-    {
-        flattenedDimensionSize *= inputs[0]->size(i);
-    }
+        resultShape.clear();
+        for (int j = 0; j < _startAxis; j++)
+        {
+            resultShape.push_back(inp0.size[j]);
+        }
+        resultShape.push_back(flattenedDimensionSize);
+        for (int j = _endAxis + 1; j < _numAxes; j++)
+        {
+            resultShape.push_back(inp0.size[j]);
+        }
+        CV_Assert(resultShape.size() <= 4);

-    std::vector outputShapeVec;
-    for (int i = 0; i < _startAxis; i++)
-    {
-        outputShapeVec.push_back(inputs[0]->size(i));
+        outputs.resize(ninputs);
+        for (i = 0; i < ninputs; i++)
+        {
+            //in-place
+            outputs[i] = inputs[i]->reshape(1, (int)resultShape.size(), &resultShape[0]);
+        }
     }
-    outputShapeVec.push_back(flattenedDimensionSize);
-    for (size_t i = _endAxis + 1; i < _numAxes; i++)
+
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        outputShapeVec.push_back(inputs[0]->size(i));
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            outputs[i] = inputs[i]->reshape(1, (int)resultShape.size(), &resultShape[0]);
+        }
     }
-    CV_Assert(outputShapeVec.size() <= 4);
-    resultShape = BlobShape(outputShapeVec);

+    int _startAxis;
+    int _endAxis;
+    size_t _numAxes;

-    for (size_t i = 0; i < inputs.size(); i++)
-    {
-        //in-place
-        outputs[i].shareFrom(*inputs[i]);
-        outputs[i].reshape(resultShape);
-    }
-}
+    std::vector<int> resultShape;
+};

-void FlattenLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+Ptr<FlattenLayer> FlattenLayer::create(const LayerParams& params)
 {
-    for (size_t j = 0; j < inputs.size(); j++)
-    {
-        outputs[j].shareFrom(*inputs[j]);
-        outputs[j].reshape(resultShape);
-    }
+    return Ptr<FlattenLayer>(new FlattenLayerImpl(params));
 }

 }
 }
diff --git a/modules/dnn/src/layers/fully_connected_layer.cpp b/modules/dnn/src/layers/fully_connected_layer.cpp
index bb5802743cb..bb296057aa7 100644
--- a/modules/dnn/src/layers/fully_connected_layer.cpp
+++ b/modules/dnn/src/layers/fully_connected_layer.cpp
@@ -41,88 +41,95 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
-#include "fully_connected_layer.hpp"
 #include "op_blas.hpp"
 #include
-#include

 namespace cv
 {
 namespace dnn
 {

-FullyConnectedLayerImpl::FullyConnectedLayerImpl(int axis_)
+class FullyConnectedLayerImpl : public InnerProductLayer
 {
-    axis = axis_;
-}
-
-void FullyConnectedLayerImpl::allocate(const std::vector<Blob*> &input, std::vector<Blob> &output)
-{
-    CV_Assert(input.size() > 0);
-    CV_Assert(1 <= blobs.size() && blobs.size() <= 2);
-    CV_Assert(blobs[0].dims() == 2);
-
-    bias = (blobs.size() >= 1);
-    axisCan = input[0]->canonicalAxis(axis);
-    dtype = input[0]->type();
-    numOutput = blobs[0].size(0);
-    innerSize = blobs[0].size(1);
-    outerSize = input[0]->total(0, axisCan);
-
-    CV_Assert((size_t)innerSize == input[0]->total(axisCan));
-    CV_Assert(!bias || (size_t)numOutput == blobs[1].total());
+public:
+    FullyConnectedLayerImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        CV_Assert(1 <= blobs.size() && blobs.size() <= 2);

-    useOpenCL = ocl::useOpenCL();
-    int allocFlags = useOpenCL ?
Blob::ALLOC_UMAT : Blob::ALLOC_UMAT; + numOutput = params.get("num_output"); + innerSize = (int)blobs[0].total() / numOutput; + bias = params.get("bias_term", true); + axis = params.get("axis", 1); - biasOnesBlob.create(Shape(outerSize, 1), dtype, allocFlags); - biasOnesBlob.setTo(1); + CV_Assert(blobs[0].dims >= 2 && (size_t)(innerSize * numOutput) == blobs[0].total()); + CV_Assert(!bias || (blobs.size() == 2 && (size_t)numOutput == blobs[1].total())); - output.resize(input.size()); - for (size_t i = 0; i < input.size(); i++) - { - CV_Assert(i == 0 || (input[i]->equalShape(*input[0]) && input[i]->type() == dtype)); - Shape outShape = Shape(outerSize, numOutput); - output[i].create(outShape, dtype, allocFlags); + blobs[0] = blobs[0].reshape(1, numOutput); + if (bias) + blobs[1] = blobs[1].reshape(1, 1); } -} - -void FullyConnectedLayerImpl::forward(std::vector &input, std::vector &output) -{ - #ifdef HAVE_OPENCL - if (useOpenCL) - forward_(input, output); - else - #endif - forward_(input, output); -} -template -void FullyConnectedLayerImpl::forward_(std::vector &input, std::vector &output) -{ - const XMat &weight = blobs[0].getRefConst(); - const XMat *biasMat = NULL, *biasOnesMat = NULL; - if (bias) + void allocate(const std::vector &input, std::vector &output) { - biasOnesMat = &biasOnesBlob.getRefConst(); - biasMat = &blobs[1].getRefConst(); + CV_Assert(input.size() > 0); + const Mat& inp0 = *input[0]; + + CV_Assert(1 <= blobs.size() && blobs.size() <= 2); + CV_Assert(blobs[0].dims == 2); + + bias = (blobs.size() >= 1); + axisCan = axis < 0 ? axis + inp0.dims : axis; + dtype = inp0.type(); + numOutput = blobs[0].size[0]; + innerSize = blobs[0].size[1]; + outerSize = inp0.total(0, axisCan); + size_t innerSize0 = inp0.total(axisCan); + + CV_Assert((size_t)innerSize == innerSize0); + CV_Assert(!bias || (size_t)numOutput == blobs[1].total()); + + biasOnesBlob.create(outerSize, 1, dtype); + biasOnesBlob.setTo(1.); + + output.resize(input.size()); + for (size_t i = 0; i < input.size(); i++) + { + CV_Assert(i == 0 || (input[i]->size == input[0]->size && input[i]->type() == dtype)); + output[i].create(outerSize, numOutput, dtype); + } } - for (size_t i = 0; i < input.size(); i++) + void forward(std::vector &input, std::vector &output) { - const XMat srcMat = reshaped(input[i]->getRefConst(), Shape(outerSize, innerSize)); - XMat dstMat = reshaped(output[i].getRef(), Shape(outerSize, numOutput)); - dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T); - + const Mat &weight = blobs[0]; + const Mat *biasMat = NULL, *biasOnesMat = NULL; if (bias) - dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1); + { + biasOnesMat = &biasOnesBlob; + biasMat = &blobs[1]; + } + + for (size_t i = 0; i < input.size(); i++) + { + Mat srcMat = input[i]->reshape(1, outerSize); + Mat dstMat = output[i].reshape(1, outerSize); + dnn::gemm(srcMat, weight, 1, dstMat, 0, GEMM_2_T); + + if (bias) + dnn::gemm(*biasOnesMat, *biasMat, 1, dstMat, 1); + } } -} + int axisCan, dtype; + int numOutput, innerSize, outerSize; + bool bias; + Mat biasOnesBlob; +}; -Ptr InnerProductLayer::create(int axis) +Ptr InnerProductLayer::create(const LayerParams& params) { - return Ptr(new FullyConnectedLayerImpl(axis)); + return Ptr(new FullyConnectedLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/fully_connected_layer.hpp b/modules/dnn/src/layers/fully_connected_layer.hpp deleted file mode 100644 index 0cf59401139..00000000000 --- a/modules/dnn/src/layers/fully_connected_layer.hpp +++ /dev/null @@ -1,71 +0,0 @@ 
-/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_FULLY_CONNECTED_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class FullyConnectedLayerImpl : public InnerProductLayer -{ - int axisCan, dtype; - int numOutput, innerSize, outerSize; - bool bias, useOpenCL; - Blob biasOnesBlob; - - template - void forward_(std::vector &input, std::vector &output); - -public: - - FullyConnectedLayerImpl(int axisCan = 1); - void allocate(const std::vector &input, std::vector &output); - void forward(std::vector &inputs, std::vector &outputs); -}; - -} -} -#endif diff --git a/modules/dnn/src/layers/layers_common.cpp b/modules/dnn/src/layers/layers_common.cpp index c1f586a269b..cc40ab271da 100644 --- a/modules/dnn/src/layers/layers_common.cpp +++ b/modules/dnn/src/layers/layers_common.cpp @@ -54,7 +54,8 @@ std::string makeName(const std::string& str1, const std::string& str2) return str1 + str2; } -bool getParameter(LayerParams ¶ms, const std::string& nameBase, const std::string& nameAll, int ¶meterH, int ¶meterW, bool hasDefault = false, const int& defaultValue = 0) +bool getParameter(const LayerParams ¶ms, const std::string& nameBase, const std::string& nameAll, + int ¶meterH, int ¶meterW, bool hasDefault = false, const int& defaultValue = 0) { std::string nameH = makeName(nameBase, std::string("_h")); std::string nameW = makeName(nameBase, std::string("_w")); @@ -92,7 +93,7 @@ bool getParameter(LayerParams ¶ms, const std::string& nameBase, const std::s } } -void getKernelSize(LayerParams ¶ms, int &kernelH, int &kernelW) +void getKernelSize(const LayerParams ¶ms, int &kernelH, int &kernelW) { if(!util::getParameter(params, "kernel", "kernel_size", kernelH, kernelW)) { @@ -102,7 +103,7 @@ void getKernelSize(LayerParams ¶ms, int &kernelH, int &kernelW) CV_Assert(kernelH > 0 && kernelW > 0); } -void getStrideAndPadding(LayerParams ¶ms, int &padH, int &padW, int &strideH, int &strideW, cv::String& padMode) +void getStrideAndPadding(const LayerParams ¶ms, int &padH, int &padW, int &strideH, int &strideW, cv::String& padMode) { util::getParameter(params, "pad", "pad", padH, padW, true, 0); util::getParameter(params, "stride", "stride", strideH, strideW, true, 1); @@ -118,7 +119,7 @@ void getStrideAndPadding(LayerParams ¶ms, int &padH, int &padW, int &strideH } -void getPoolingKernelParams(LayerParams ¶ms, int &kernelH, int &kernelW, bool &globalPooling, +void getPoolingKernelParams(const LayerParams ¶ms, int &kernelH, int &kernelW, bool &globalPooling, int &padH, int &padW, int &strideH, int &strideW, cv::String &padMode) { util::getStrideAndPadding(params, padH, padW, strideH, strideW, padMode); @@ -142,7 +143,7 @@ void getPoolingKernelParams(LayerParams ¶ms, int &kernelH, int &kernelW, boo } } -void getConvolutionKernelParams(LayerParams ¶ms, int &kernelH, int &kernelW, int &padH, int &padW, +void getConvolutionKernelParams(const LayerParams ¶ms, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW, int &dilationH, int &dilationW, cv::String &padMode) { util::getKernelSize(params, kernelH, kernelW); diff --git a/modules/dnn/src/layers/layers_common.hpp b/modules/dnn/src/layers/layers_common.hpp index b27afafe7ba..78e6ace3e78 100644 --- a/modules/dnn/src/layers/layers_common.hpp +++ b/modules/dnn/src/layers/layers_common.hpp @@ -50,15 +50,16 @@ namespace cv namespace dnn { -void getConvolutionKernelParams(LayerParams ¶ms, int &kernelH, int &kernelW, int &padH, int &padW, +void 
getConvolutionKernelParams(const LayerParams ¶ms, int &kernelH, int &kernelW, int &padH, int &padW, int &strideH, int &strideW, int &dilationH, int &dilationW, cv::String& padMode); -void getPoolingKernelParams(LayerParams ¶ms, int &kernelH, int &kernelW, bool &globalPooling, +void getPoolingKernelParams(const LayerParams ¶ms, int &kernelH, int &kernelW, bool &globalPooling, int &padH, int &padW, int &strideH, int &strideW, cv::String& padMode); void getConvPoolOutParams(const int inputH, const int inputW, const cv::Size& kernel, const cv::Size& stride, cv::Size &pad, const cv::String& padMode, int &outH, int &outW); + } } diff --git a/modules/dnn/src/layers/lrn_layer.cpp b/modules/dnn/src/layers/lrn_layer.cpp index 9ecb58a4e07..4f8f06aee0f 100644 --- a/modules/dnn/src/layers/lrn_layer.cpp +++ b/modules/dnn/src/layers/lrn_layer.cpp @@ -41,10 +41,7 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "lrn_layer.hpp" -#include "opencl_kernels_dnn.hpp" #include -#include #include #include @@ -53,207 +50,142 @@ namespace cv namespace dnn { -LRNLayerImpl::LRNLayerImpl(int type_, int size_, double alpha_, double beta_, double bias_, bool normBySize_) +class LRNLayerImpl : public LRNLayer { - type = type_; - size = size_; - alpha = alpha_; - beta = beta_; - bias = bias_; - normBySize = normBySize_; -} +public: + LRNLayerImpl(const LayerParams& params) + { + setParamsFrom(params); + type = -1; + String nrmType = params.get("norm_region", "ACROSS_CHANNELS"); + if (nrmType == "ACROSS_CHANNELS") + type = LRNLayer::CHANNEL_NRM; + else if (nrmType == "WITHIN_CHANNEL") + type = LRNLayer::SPATIAL_NRM; + else + CV_Error(Error::StsBadArg, "Unknown region type \"" + nrmType + "\""); + + size = params.get("local_size", 5); + if (size % 2 != 1 || size <= 0) + CV_Error(Error::StsBadArg, "LRN layer supports only positive odd values for local_size"); + + alpha = params.get("alpha", 1); + beta = params.get("beta", 0.75); + bias = params.get("bias", 1); + normBySize = params.get("norm_by_size", true); + } -void LRNLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() == 1 && inputs[0]->dims() == 4); - CV_Assert(type == CHANNEL_NRM || type == SPATIAL_NRM); - useOpenCL = cv::ocl::useOpenCL(); + void allocate(const std::vector &inputs, std::vector &outputs) + { + CV_Assert(inputs.size() == 1 && inputs[0]->dims == 4); + CV_Assert(type == CHANNEL_NRM || type == SPATIAL_NRM); - if (type == SPATIAL_NRM && !useOpenCL) - buf.create(inputs[0]->shape().slice(2), inputs[0]->type(), Blob::ALLOC_MAT); - if (type == CHANNEL_NRM && useOpenCL) - buf.create(inputs[0]->shape().slice(2), inputs[0]->type(), Blob::ALLOC_UMAT); + const Mat& inp0 = *inputs[0]; - outputs.resize(1); - outputs[0].create(inputs[0]->shape(), inputs[0]->type()); -} + if (type == SPATIAL_NRM) + buf.create(inp0.size[2], inp0.size[3], inp0.type()); -void LRNLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - Blob &src = *inputs[0]; - Blob &dst = outputs[0]; - - switch (type) - { - case CHANNEL_NRM: - channelNoramlization(src, dst); - break; - case SPATIAL_NRM: - spatialNormalization(src, dst); - break; - default: - CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer"); - break; + outputs.resize(1); + outputs[0].create(inp0.dims, inp0.size.p, inp0.type()); } -} - -template -static XMat getPlane(XMat &m, int n, int cn) -{ - return reshaped(slice(m, n, cn), BlobShape::like(m).slice(2)); -} -void LRNLayerImpl::channelNoramlization(Blob &src, Blob &dst) -{ - if (!useOpenCL) - 
channelNormalization_<Mat>(src, dst);
-    else
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        //channelNoramlization_ocl(src.getRefConst<UMat>(), dst.getRef<UMat>()); //consumes a lot of memory
-        channelNormalization_<UMat>(src, dst);
-    }
-}
+        Mat &src = *inputs[0];
+        Mat &dst = outputs[0];
-template<typename XMat>
-void LRNLayerImpl::channelNormalization_(Blob &srcBlob, Blob &dstBlob)
-{
-    int num = srcBlob.num();
-    int channels = srcBlob.channels();
-    int ksize = (size - 1) / 2;
-    int sizeNormFactor = normBySize ? size : 1;
-
-    XMat srcMat = srcBlob.getRefConst<XMat>().clone();
-    XMat dstMat = dstBlob.getRef<XMat>();
+        switch (type)
+        {
+        case CHANNEL_NRM:
+            channelNormalization(src, dst);
+            break;
+        case SPATIAL_NRM:
+            spatialNormalization(src, dst);
+            break;
+        default:
+            CV_Error(Error::StsNotImplemented, "Unimplemented mode of LRN layer");
+            break;
+        }
+    }
-    for (int n = 0; n < num; n++)
+    void channelNormalization(Mat &srcBlob, Mat &dstBlob)
     {
-        XMat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
-        accum.setTo(0);
+        int num = srcBlob.size[0];
+        int channels = srcBlob.size[1];
+        int ksize = (size - 1) / 2;
+        int sizeNormFactor = normBySize ? size : 1;
-        for (int cn = 0; cn < std::min(ksize, channels); cn++)
-            cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
+        Mat srcMat = srcBlob.clone();
+        Mat dstMat = dstBlob;
-        for (int cn = 0; cn < channels; cn++)
+        for (int n = 0; n < num; n++)
         {
-            if (cn + ksize < channels)
-            {
-                cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
-            }
+            Mat accum = getPlane(dstMat, n, channels-1); //trick for memory saving
+            accum.setTo(0);
-            if (cn - ksize - 1 >= 0)
+            for (int cn = 0; cn < std::min(ksize, channels); cn++)
+                cv::accumulateSquare(getPlane(srcMat, n, cn), accum);
+
+            for (int cn = 0; cn < channels; cn++)
             {
-                //subtractSquare
-                XMat left = getPlane(srcMat, n, cn - ksize - 1);
-                cv::pow(left, 2, left);
-                cv::subtract(accum, left, accum);
+                if (cn + ksize < channels)
+                {
+                    cv::accumulateSquare(getPlane(srcMat, n, cn + ksize), accum);
+                }
+
+                if (cn - ksize - 1 >= 0)
+                {
+                    //subtractSquare
+                    Mat left = getPlane(srcMat, n, cn - ksize - 1);
+                    cv::pow(left, 2, left);
+                    cv::subtract(accum, left, accum);
+                }
+
+                Mat dst = getPlane(dstMat, n, cn);
+                accum.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
+                cv::pow(dst, beta, dst);
+                cv::divide(getPlane(srcMat, n, cn), dst, dst);
             }
-
-            XMat dst = getPlane(dstMat, n, cn);
-            accum.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
-            cv::pow(dst, beta, dst);
-            cv::divide(getPlane(srcMat, n, cn), dst, dst);
         }
     }
-}
-bool LRNLayerImpl::channelNormalization_ocl(const UMat &src, UMat &dst)
-{
-#ifdef HAVE_OPENCL
-    if (src.offset != 0 || dst.offset != 0) //TODO: add offset
-        return false;
-
-    String buildOpts = String("-DT=") + ocl::typeToStr(src.type());
-
-    ocl::Kernel kerScale("LRNFillScale", ocl::dnn::lrn_oclsrc, buildOpts);
-    if (kerScale.empty())
-        return false;
-
-    ocl::Kernel kerOutput("LRNComputeOutput", ocl::dnn::lrn_oclsrc, buildOpts);
-    if (kerOutput.empty())
-        return false;
-
-    Shape shape = Shape::like(src);
-    int ksize = (size - 1) / 2;
-    int sizeNormFactor = normBySize ?
size : 1;
-    // TODO: add bias
-    size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize();
-    UMat &scaleBuf = buf.umatRef();
-
-    size_t nthreads = (size_t)(shape.total() / shape[1]);
-    kerScale.args((int)nthreads,
-                  ocl::KernelArg::PtrReadOnly(src), shape[0], shape[1], shape[2], shape[3],
-                  size, (float)(alpha/sizeNormFactor), (float)ksize, ocl::KernelArg::PtrWriteOnly(scaleBuf));
-    if (!kerScale.run(1, &nthreads, &wgSize, true))
-        return false;
-
-    nthreads = (size_t)shape.total();
-    kerOutput.args((int)nthreads,
-                   ocl::KernelArg::PtrReadOnly(src), ocl::KernelArg::PtrReadOnly(scaleBuf),
-                   -beta, ocl::KernelArg::PtrWriteOnly(dst) );
-    if (!kerOutput.run(1, &nthreads, &wgSize, true))
-        return false;
-
-    return true;
-#else
-    (void)src;
-    (void)dst;
-    return false;
-#endif
-}
-
-void LRNLayerImpl::spatialNormalization(Blob &src, Blob &dst)
-{
-    if (!useOpenCL)
-        spatialNormalization_<Mat>(src, dst);
-    else
-        spatialNormalization_<UMat>(src, dst);
-}
-
-//TODO: fix cv::boxFilter with BORDER_ISOLATED flag in CPU mode
-template<>
-void LRNLayerImpl::sqrBoxFilter_<Mat>(const Mat &src, Mat &dst)
-{
-    Mat srcRawWrapper(src.rows, src.cols, src.type(), src.data, src.step[0]);
-    cv::sqrBoxFilter(srcRawWrapper, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT);
-}
-
-template<>
-void LRNLayerImpl::sqrBoxFilter_<UMat>(const UMat &src, UMat &dst)
-{
-    cv::sqrBoxFilter(src, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT | BORDER_ISOLATED);
-}
+    void sqrBoxFilter_(const Mat &src, Mat &dst)
+    {
+        Mat srcRawWrapper(src.rows, src.cols, src.type(), src.data, src.step[0]);
+        cv::sqrBoxFilter(srcRawWrapper, dst, dst.depth(), Size(size, size), Point(-1, -1), false, BORDER_CONSTANT);
+    }
-template<typename XMat>
-void LRNLayerImpl::spatialNormalization_(Blob &srcBlob, Blob &dstBlob)
-{
-    int num = srcBlob.num();
-    int channels = srcBlob.channels();
-    int sizeNormFactor = normBySize ? size*size : 1;
+    void spatialNormalization(Mat &srcBlob, Mat &dstBlob)
+    {
+        int num = srcBlob.size[0];
+        int channels = srcBlob.size[1];
+        int sizeNormFactor = normBySize ? size*size : 1;
-    XMat srcMat = srcBlob.getRefConst<XMat>();
-    XMat dstMat = dstBlob.getRef<XMat>();
+        Mat srcMat = srcBlob;
+        Mat dstMat = dstBlob;
-    for (int n = 0; n < num; n++)
-    {
-        for (int cn = 0; cn < channels; cn++)
+        for (int n = 0; n < num; n++)
         {
-            XMat src = getPlane(srcMat, n, cn);
-            XMat dst = getPlane(dstMat, n, cn);
+            for (int cn = 0; cn < channels; cn++)
+            {
+                Mat src = getPlane(srcMat, n, cn);
+                Mat dst = getPlane(dstMat, n, cn);
-            sqrBoxFilter_(src, dst);
+                sqrBoxFilter_(src, dst);
-            dst.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
-            cv::pow(dst, beta, dst);
-            cv::divide(src, dst, dst);
+                dst.convertTo(dst, dst.type(), alpha/sizeNormFactor, bias);
+                cv::pow(dst, beta, dst);
+                cv::divide(src, dst, dst);
+            }
         }
     }
-}
+    Mat buf;
+};
-Ptr<LRNLayer> LRNLayer::create(int type, int size, double alpha, double beta, double bias,
-                               bool normBySize)
+Ptr<LRNLayer> LRNLayer::create(const LayerParams& params)
 {
-    return Ptr<LRNLayer>(new LRNLayerImpl(type, size, alpha, beta, bias, normBySize));
+    return Ptr<LRNLayer>(new LRNLayerImpl(params));
 }
 }
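The hunk above shows the pattern this patch applies module-wide: a layer reads its configuration from a LayerParams dictionary inside a static create() factory instead of taking typed constructor arguments. A hedged sketch of driving the new LRN factory directly — this assumes the headers and API of this branch, and the parameter values are purely illustrative:

#include <opencv2/dnn.hpp>
using namespace cv;
using namespace cv::dnn;

int main()
{
    // Build the same dictionary a Caffe importer would hand to the factory.
    LayerParams lp;
    lp.set("norm_region", String("ACROSS_CHANNELS")); // or "WITHIN_CHANNEL"
    lp.set("local_size", 5);                          // must be positive and odd
    lp.set("alpha", 1e-4);
    lp.set("beta", 0.75);

    Ptr<LRNLayer> lrn = LRNLayer::create(lp);
    // allocate()/forward() are then called with 4-D CV_32F blobs (N x C x H x W).
    return 0;
}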
diff --git a/modules/dnn/src/layers/lrn_layer.hpp b/modules/dnn/src/layers/lrn_layer.hpp
deleted file mode 100644
index cbdebb88eae..00000000000
--- a/modules/dnn/src/layers/lrn_layer.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
-#define __OPENCV_DNN_LAYERS_LRN_LAYER_HPP__
-#include "../precomp.hpp"
-#include
-
-namespace cv
-{
-namespace dnn
-{
-
-class LRNLayerImpl : public LRNLayer
-{
-    bool useOpenCL;
-    Blob buf;
-
-    void channelNoramlization(Blob &src, Blob &dst);
-    template<typename XMat>
-    void channelNormalization_(Blob &src, Blob &dst);
-    bool channelNormalization_ocl(const UMat &src, UMat &dst);
-
-    void spatialNormalization(Blob &src, Blob &dst);
-    template<typename XMat>
-    void spatialNormalization_(Blob &src, Blob &dst);
-    template<typename XMat>
-    void sqrBoxFilter_(const XMat &src, XMat &dst);
-
-public:
-
-    LRNLayerImpl(int type = CHANNEL_NRM, int size = 5, double alpha = 1,
-                 double beta = 0.75, double bias = 1, bool normBySize = true);
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-};
-
-}
-}
-
-#endif
diff --git a/modules/dnn/src/layers/max_unpooling_layer.cpp b/modules/dnn/src/layers/max_unpooling_layer.cpp
index 71283fc8afd..c42e110a456 100644
--- a/modules/dnn/src/layers/max_unpooling_layer.cpp
+++ b/modules/dnn/src/layers/max_unpooling_layer.cpp
@@ -9,64 +9,84 @@ Implementation of Batch Normalization layer.
 */
-#include "max_unpooling_layer.hpp"
+#include "../precomp.hpp"
+#include "layers_common.hpp"
+#include
 namespace cv
 {
 namespace dnn
 {
-MaxUnpoolLayerImpl::MaxUnpoolLayerImpl(Size poolKernel_, Size poolPad_, Size poolStride_):
-    poolKernel(poolKernel_),
-    poolPad(poolPad_),
-    poolStride(poolStride_)
-{}
-
-void MaxUnpoolLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+class MaxUnpoolLayerImpl : public MaxUnpoolLayer
 {
-    CV_Assert(inputs.size() == 2);
-    CV_Assert(inputs[0]->total() == inputs[1]->total());
-
-    BlobShape outShape = inputs[0]->shape();
-    outShape[2] = (outShape[2] - 1) * poolStride.height + poolKernel.height - 2 * poolPad.height;
-    outShape[3] = (outShape[3] - 1) * poolStride.width + poolKernel.width - 2 * poolPad.width;
+public:
+    MaxUnpoolLayerImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        poolKernel = Size(params.get<int>("pool_k_w"), params.get<int>("pool_k_h"));
+        poolPad = Size(params.get<int>("pool_pad_w"), params.get<int>("pool_pad_h"));
+        poolStride = Size(params.get<int>("pool_stride_w"), params.get<int>("pool_stride_h"));
+    }
-    outputs.resize(1);
-    outputs[0].create(outShape);
-}
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() == 2);
+        const Mat& inp0 = *inputs[0];
+        CV_Assert(inp0.total() == inputs[1]->total());
+        CV_Assert(inp0.dims == 4);
-void MaxUnpoolLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    CV_Assert(inputs.size() == 2);
-    Blob& input = *inputs[0];
-    Blob& indices = *inputs[1];
+        int outShape[] = { inp0.size[0], inp0.size[1], inp0.size[2], inp0.size[3] };
+        outShape[2] = (outShape[2] - 1) * poolStride.height + poolKernel.height - 2 * poolPad.height;
+        outShape[3] = (outShape[3] - 1) * poolStride.width + poolKernel.width - 2 * poolPad.width;
-    CV_Assert(input.total() == indices.total());
-    CV_Assert(input.num() == 1);
+        outputs.resize(1);
+        outputs[0].create(4, outShape, inp0.type());
+    }
-    for(int i_n = 0; i_n < outputs.size(); i_n++)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        Blob& outBlob = outputs[i_n];
-        outBlob.setTo(0);
-        CV_Assert(input.channels() == outBlob.channels());
+        CV_Assert(inputs.size() == 2);
+        Mat& input = *inputs[0];
+        Mat& indices = *inputs[1];
+
+        CV_Assert(input.total() == indices.total());
+        CV_Assert(input.size[0] == 1);
+        CV_Assert(input.isContinuous());
-        for (int i_c = 0; i_c < input.channels(); i_c++)
+        for(int i_n = 0; i_n < outputs.size(); i_n++)
         {
-            Mat outPlane = outBlob.getPlane(0, i_c);
-            for(int i_wh = 0; i_wh < input.size2().area(); i_wh++)
+            Mat& outBlob = outputs[i_n];
+            outBlob.setTo(0);
+            CV_Assert(input.size[1] == outBlob.size[1]);
+            int outPlaneTotal = outBlob.size[2]*outBlob.size[3];
+
+            for (int i_c = 0; i_c < input.size[1]; i_c++)
             {
-                int index = indices.getPlane(0, i_c).at<float>(i_wh);
+                Mat outPlane = getPlane(outBlob, 0, i_c);
+                int wh_area = input.size[2]*input.size[3];
+                const float* inptr = input.ptr<float>(0, i_c);
+                const float* idxptr = indices.ptr<float>(0, i_c);
+                float* outptr = outPlane.ptr<float>();
-                CV_Assert(index < outPlane.total());
-                outPlane.at<float>(index) = input.getPlane(0, i_c).at<float>(i_wh);
+                for(int i_wh = 0; i_wh < wh_area; i_wh++)
+                {
+                    int index = idxptr[i_wh];
+                    CV_Assert(0 <= index && index < outPlaneTotal);
+                    outptr[index] = inptr[i_wh];
+                }
             }
         }
    }
-}
-Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(Size poolKernel, Size poolPad, Size poolStride)
+    Size poolKernel;
+    Size poolPad;
+    Size poolStride;
+};
+
+Ptr<MaxUnpoolLayer> MaxUnpoolLayer::create(const LayerParams& params)
 {
-    return Ptr<MaxUnpoolLayer>(new MaxUnpoolLayerImpl(poolKernel, poolPad, poolStride));
+    return
Ptr<MaxUnpoolLayer>(new MaxUnpoolLayerImpl(params));
 }
 }
diff --git a/modules/dnn/src/layers/max_unpooling_layer.hpp b/modules/dnn/src/layers/max_unpooling_layer.hpp
deleted file mode 100644
index a73f460cee8..00000000000
--- a/modules/dnn/src/layers/max_unpooling_layer.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-// Copyright (C) 2016, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-
-/*
-Declaration of MaxUnpooling layer.
-*/
-
-#ifndef __OPENCV_DNN_LAYERS_MAX_UNPOOLING_LAYER_HPP__
-#define __OPENCV_DNN_LAYERS_MAX_UNPOOLING_LAYER_HPP__
-#include "../precomp.hpp"
-#include
-
-namespace cv
-{
-namespace dnn
-{
-
-class MaxUnpoolLayerImpl : public MaxUnpoolLayer
-{
-public:
-    MaxUnpoolLayerImpl(Size poolKernel_, Size poolPad_, Size poolStride_);
-
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-
-private:
-    Size poolKernel;
-    Size poolPad;
-    Size poolStride;
-};
-
-}
-}
-#endif // __OPENCV_DNN_LAYERS_MAX_UNPOOLING_LAYER_HPP__
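The allocate() above sizes the unpooled blob by inverting the pooling formula along each spatial axis. A small self-contained check of that arithmetic (the concrete numbers are made up):

#include <cassert>

// Unpooled size along one axis: inverse of the pooled-size formula.
static int unpoolSize(int pooled, int stride, int kernel, int pad)
{
    return (pooled - 1) * stride + kernel - 2 * pad;
}

int main()
{
    // A 2x2/stride-2 max pool maps 8 -> 4, so unpooling maps 4 -> 8.
    assert(unpoolSize(4, 2, 2, 0) == 8);
    // With kernel 3, stride 2, pad 1: pooling maps 25 -> 13, unpooling 13 -> 25.
    assert(unpoolSize(13, 2, 3, 1) == 25);
    return 0;
}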
diff --git a/modules/dnn/src/layers/mvn_layer.cpp b/modules/dnn/src/layers/mvn_layer.cpp
index 36c48c40bf8..b658d9548ac 100644
--- a/modules/dnn/src/layers/mvn_layer.cpp
+++ b/modules/dnn/src/layers/mvn_layer.cpp
@@ -41,7 +41,6 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
-#include "mvn_layer.hpp"
 #include
 namespace cv
@@ -49,52 +48,59 @@ namespace cv
 namespace dnn
 {
-MVNLayerImpl::MVNLayerImpl(bool normVariance_, bool acrossChannels_, double eps_)
+class MVNLayerImpl : public MVNLayer
 {
-    normVariance = normVariance_;
-    acrossChannels = acrossChannels_;
-    eps = eps_;
-}
+public:
+    MVNLayerImpl(const LayerParams& params)
+    {
+        setParamsFrom(params);
+        normVariance = params.get<bool>("normalize_variance", true);
+        acrossChannels = params.get<bool>("across_channels", false);
+        eps = params.get<double>("eps", 1e-9);
+    }
-void MVNLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    outputs.resize(inputs.size());
-    for (size_t i = 0; i < inputs.size(); i++)
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        CV_Assert(!acrossChannels || inputs[i]->dims() >= 2);
-        outputs[i].create(inputs[i]->shape(), inputs[i]->type());
+        outputs.resize(inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
+        {
+            int dims = inputs[i]->dims;
+            CV_Assert(!acrossChannels || dims >= 2);
+            outputs[i].create(dims, inputs[i]->size.p, inputs[i]->type());
+        }
     }
-}
-void MVNLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        Blob &inpBlob = *inputs[inpIdx];
-        Blob &outBlob = outputs[inpIdx];
+        for (size_t inpIdx = 0; inpIdx < inputs.size(); inpIdx++)
+        {
+            Mat &inpBlob = *inputs[inpIdx];
+            Mat &outBlob = outputs[inpIdx];
-        int splitDim = (acrossChannels) ? 1 : 2;
-        Shape workSize((int)inpBlob.total(0, splitDim), (int)inpBlob.total(splitDim));
-        Mat inpMat = reshaped(inpBlob.matRefConst(), workSize);
-        Mat outMat = reshaped(outBlob.matRef(), workSize);
+            int splitDim = (acrossChannels) ? 1 : 2;
+            int i, newRows = 1;
+            for( i = 0; i < splitDim; i++ )
+                newRows *= inpBlob.size[i];
+            Mat inpMat = inpBlob.reshape(1, newRows);
+            Mat outMat = outBlob.reshape(1, newRows);
-        Scalar mean, dev;
-        for (int i = 0; i < workSize[0]; i++)
-        {
-            Mat inpRow = inpMat.row(i);
-            Mat outRow = outMat.row(i);
+            Scalar mean, dev;
+            for ( i = 0; i < newRows; i++)
+            {
+                Mat inpRow = inpMat.row(i);
+                Mat outRow = outMat.row(i);
-            cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
-            double alpha = (normVariance) ? 1/(eps + dev[0]) : 1;
-            inpRow.convertTo(outRow, outRow.type(), alpha, -mean[0] * alpha);
+                cv::meanStdDev(inpRow, mean, (normVariance) ? dev : noArray());
+                double alpha = (normVariance) ? 1/(eps + dev[0]) : 1;
+                inpRow.convertTo(outRow, outRow.type(), alpha, -mean[0] * alpha);
+            }
         }
     }
-}
-
+};
-Ptr<MVNLayer> MVNLayer::create(bool normVariance, bool acrossChannels, double eps)
+Ptr<MVNLayer> MVNLayer::create(const LayerParams& params)
 {
-    return Ptr<MVNLayer>(new MVNLayerImpl(normVariance, acrossChannels, eps));
+    return Ptr<MVNLayer>(new MVNLayerImpl(params));
 }
 }
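Per row of the reshaped blob, MVN subtracts the mean and optionally divides by (eps + stddev); that is exactly what the convertTo() call in the loop above computes. A core-OpenCV-only sketch of the same math on one row, with toy data:

#include <opencv2/core.hpp>
using namespace cv;

int main()
{
    Mat row = (Mat_<float>(1, 4) << 1.f, 2.f, 3.f, 4.f);
    Scalar mean, dev;
    meanStdDev(row, mean, dev);

    double eps = 1e-9;                                   // layer default
    double alpha = 1.0 / (eps + dev[0]);                 // 1/sigma when normalizing variance
    Mat out;
    row.convertTo(out, CV_32F, alpha, -mean[0] * alpha); // (x - mu) / sigma
    return 0;
}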
diff --git a/modules/dnn/src/layers/mvn_layer.hpp b/modules/dnn/src/layers/mvn_layer.hpp
deleted file mode 100644
index 80b89544b63..00000000000
--- a/modules/dnn/src/layers/mvn_layer.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DNN_LAYERS_MVN_LAYER_HPP__
-#define __OPENCV_DNN_LAYERS_MVN_LAYER_HPP__
-#include "../precomp.hpp"
-#include
-
-namespace cv
-{
-namespace dnn
-{
-
-class MVNLayerImpl : public MVNLayer
-{
-public:
-
-    MVNLayerImpl(bool normVariance_ = true, bool acrossChannels_ = false, double eps_ = 1e-9);
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-};
-
-}
-}
-#endif
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.cpp b/modules/dnn/src/layers/normalize_bbox_layer.cpp
index 926465b77fd..a14648785af 100644
--- a/modules/dnn/src/layers/normalize_bbox_layer.cpp
+++ b/modules/dnn/src/layers/normalize_bbox_layer.cpp
@@ -41,7 +41,6 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
-#include "normalize_bbox_layer.hpp"
 #include "op_blas.hpp"
 #include
@@ -52,150 +51,186 @@ namespace cv
 namespace dnn
 {
-const std::string NormalizeBBoxLayer::_layerName = std::string("NormalizeBBox");
-
-bool NormalizeBBoxLayer::getParameterDict(const LayerParams &params,
-                                          const std::string &parameterName,
-                                          DictValue& result)
+class NormalizeBBoxLayerImpl : public NormalizeBBoxLayer
 {
-    if (!params.has(parameterName))
-    {
-        return false;
-    }
+public:
+    Mat _buffer;
-    result = params.get(parameterName);
-    return true;
-}
+    Mat _sumChannelMultiplier;
+    Mat _sumSpatialMultiplier;
-template<typename T>
-T NormalizeBBoxLayer::getParameter(const LayerParams &params,
-                                   const std::string &parameterName,
-                                   const size_t &idx,
-                                   const bool required,
-                                   const T& defaultValue)
-{
-    DictValue dictValue;
-    bool success = getParameterDict(params, parameterName, dictValue);
-    if(!success)
+    Mat _scale;
+
+    float _eps;
+    bool _across_spatial;
+    bool _channel_shared;
+
+    size_t _num;
+    size_t _channels;
+    size_t _rows;
+    size_t _cols;
+
+    size_t _channelSize;
+    size_t _imageSize;
+
+    static const size_t _numAxes = 4;
+    static const std::string _layerName;
+
+    bool getParameterDict(const LayerParams &params,
+                          const std::string &parameterName,
+                          DictValue& result)
     {
-        if(required)
+        if (!params.has(parameterName))
         {
-            std::string message = _layerName;
-            message += " layer parameter does not contain ";
-            message += parameterName;
-            message += " parameter.";
-            CV_Error(Error::StsBadArg, message);
+            return false;
         }
-        else
-        {
-            return defaultValue;
-        }
-    }
-    return dictValue.get<T>(idx);
-}
-NormalizeBBoxLayer::NormalizeBBoxLayer(LayerParams &params) : Layer(params)
-{
-    _eps = getParameter<float>(params, "eps", 0, false, 1e-10f);
-    _across_spatial = getParameter<bool>(params, "across_spatial");
-    _channel_shared = getParameter<bool>(params, "channel_shared");
-}
+        result = params.get(parameterName);
+        return true;
+    }
-void NormalizeBBoxLayer::checkInputs(const std::vector<Blob*> &inputs)
-{
-    CV_Assert(inputs.size() > 0);
-    for (size_t i = 1; i < inputs.size(); i++)
+    template<typename T>
+    T getParameter(const LayerParams &params,
+                   const std::string &parameterName,
+                   const size_t &idx=0,
+                   const bool required=true,
+                   const T& defaultValue=T())
     {
-        for (size_t j = 0; j < _numAxes; j++)
+        DictValue dictValue;
+        bool success = getParameterDict(params, parameterName, dictValue);
+        if(!success)
         {
-            CV_Assert(inputs[i]->shape()[j] == inputs[0]->shape()[j]);
+            if(required)
+            {
+                std::string message = _layerName;
+                message += " layer parameter does not contain ";
+                message += parameterName;
+                message += " parameter.";
+                CV_Error(Error::StsBadArg, message);
+            }
+            else
+            {
+                return defaultValue;
+            }
         }
+        return dictValue.get<T>(idx);
     }
-    CV_Assert(inputs[0]->dims() > 2);
-}
-
-void NormalizeBBoxLayer::allocate(const
std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    checkInputs(inputs);
-
-    _num = inputs[0]->num();
-    _channels = inputs[0]->shape()[1];
-    _rows = inputs[0]->shape()[2];
-    _cols = inputs[0]->shape()[3];
-
-    _channelSize = _rows * _cols;
-    _imageSize = _channelSize * _channels;
-    _buffer = Mat(_channels, _channelSize, CV_32F);
-
-    _sumChannelMultiplier = Mat(_channels, 1, CV_32F, Scalar(1.0));
-    _sumSpatialMultiplier = Mat(1, _channelSize, CV_32F, Scalar(1.0));
-
-    _scale = blobs[0];
-
-    for(size_t i = 0; i < inputs.size(); i++)
+    NormalizeBBoxLayerImpl(const LayerParams &params)
     {
-        outputs[i].create(BlobShape(inputs[0]->shape()));
+        _eps = getParameter<float>(params, "eps", 0, false, 1e-10f);
+        _across_spatial = getParameter<bool>(params, "across_spatial");
+        _channel_shared = getParameter<bool>(params, "channel_shared");
+        setParamsFrom(params);
    }
-}
-void NormalizeBBoxLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    Mat zeroBuffer(_channels, _channelSize, CV_32F, Scalar(0));
-    Mat absDiff;
-
-    for (size_t j = 0; j < inputs.size(); j++)
+    void checkInputs(const std::vector<Mat*> &inputs)
     {
-        for (size_t n = 0; n < _num; ++n)
+        CV_Assert(inputs.size() > 0);
+        for (size_t i = 1; i < inputs.size(); i++)
         {
-            Mat src = Mat(_channels, _channelSize, CV_32F, inputs[j]->ptrf(n));
-            Mat dst = Mat(_channels, _channelSize, CV_32F, outputs[j].ptrf(n));
+            CV_Assert(inputs[i]->size == inputs[0]->size);
+        }
+        CV_Assert(inputs[0]->dims > 2);
+    }
-            _buffer = src.mul(src);
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        checkInputs(inputs);
-            if (_across_spatial)
-            {
-                absdiff(_buffer, zeroBuffer, absDiff);
+        const Mat& inp0 = *inputs[0];
+        CV_Assert(inp0.dims == 4 && inp0.type() == CV_32F);
-                // add eps to avoid overflow
-                double absSum = sum(absDiff)[0] + _eps;
+        _num = inp0.size[0];
+        _channels = inp0.size[1];
+        _rows = inp0.size[2];
+        _cols = inp0.size[3];
-                float norm = sqrt(absSum);
-                dst = src / norm;
-            }
-            else
-            {
-                Mat norm(_channelSize, 1, _buffer.type()); // 1 x _channelSize
+        _channelSize = _rows * _cols;
+        _imageSize = _channelSize * _channels;
-                // (_channels x_channelSize)T * _channels x 1 -> _channelSize x 1
-                gemmCPU(_buffer, _sumChannelMultiplier, 1, norm, 0, GEMM_1_T);
+        _buffer = Mat(_channels, _channelSize, CV_32F);
-                // compute norm
-                pow(norm, 0.5f, norm);
+        _sumChannelMultiplier = Mat(_channels, 1, CV_32F, Scalar(1.0));
+        _sumSpatialMultiplier = Mat(1, _channelSize, CV_32F, Scalar(1.0));
-                // scale the layer
-                // _channels x 1 * (_channelSize x 1)T -> _channels x _channelSize
-                gemmCPU(_sumChannelMultiplier, norm, 1, _buffer, 0, GEMM_2_T);
+        _scale = blobs[0];
+        size_t i, ninputs = inputs.size();
+        outputs.resize(ninputs);
-                dst = src / _buffer;
-            }
+        for(i = 0; i < ninputs; i++)
+        {
+            outputs[i].create(inp0.dims, inp0.size.p, inp0.type());
+        }
+    }
-            // scale the output
-            if (_channel_shared)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        Mat zeroBuffer(_channels, _channelSize, CV_32F, Scalar(0));
+        Mat absDiff;
+
+        for (size_t j = 0; j < inputs.size(); j++)
+        {
+            for (size_t n = 0; n < _num; ++n)
             {
-                // _scale: 1 x 1
-                dst *= _scale.matRefConst().at<float>(0, 0);
+                Mat src = Mat(_channels, _channelSize, CV_32F, inputs[j]->ptr<float>(n));
+                Mat dst = Mat(_channels, _channelSize, CV_32F, outputs[j].ptr<float>(n));
+
+                _buffer = src.mul(src);
+
+                if (_across_spatial)
+                {
+                    absdiff(_buffer, zeroBuffer, absDiff);
+
+                    // add eps to avoid overflow
+                    double absSum = sum(absDiff)[0] + _eps;
+
+                    float norm = sqrt(absSum);
+                    dst = src / norm;
+                }
+                else
+                {
+                    Mat norm(_channelSize, 1,
_buffer.type()); // 1 x _channelSize
+
+                    // (_channels x_channelSize)T * _channels x 1 -> _channelSize x 1
+                    gemmCPU(_buffer, _sumChannelMultiplier, 1, norm, 0, GEMM_1_T);
+
+                    // compute norm
+                    pow(norm, 0.5f, norm);
+
+                    // scale the layer
+                    // _channels x 1 * (_channelSize x 1)T -> _channels x _channelSize
+                    gemmCPU(_sumChannelMultiplier, norm, 1, _buffer, 0, GEMM_2_T);
+
+                    dst = src / _buffer;
+                }
+
+                // scale the output
+                if (_channel_shared)
+                {
+                    // _scale: 1 x 1
+                    dst *= _scale.at<float>(0, 0);
+                }
+                else
+                {
+                    // _scale: _channels x 1
+                    // _channels x 1 * 1 x _channelSize -> _channels x _channelSize
+                    gemmCPU(_scale, _sumSpatialMultiplier, 1, _buffer, 0);
+
+                    dst = dst.mul(_buffer);
+                }
             }
-            else
-            {
-                // _scale: _channels x 1
-                // _channels x 1 * 1 x _channelSize -> _channels x _channelSize
-                gemmCPU(_scale.matRefConst(), _sumSpatialMultiplier, 1, _buffer, 0);
-
-                dst = dst.mul(_buffer);
-            }
         }
    }
+
+};
+
+const std::string NormalizeBBoxLayerImpl::_layerName = std::string("NormalizeBBox");
+
+Ptr<NormalizeBBoxLayer> NormalizeBBoxLayer::create(const LayerParams &params)
+{
+    return Ptr<NormalizeBBoxLayer>(new NormalizeBBoxLayerImpl(params));
 }
+
 }
 }
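In the across_spatial branch above, the whole C x (H*W) slice is divided by one L2 norm, with _eps added before the square root; the learned _scale blob is then applied on top. A sketch of just that normalization step using core OpenCV, with toy shapes and the scale factor omitted:

#include <opencv2/core.hpp>
#include <cmath>
using namespace cv;

int main()
{
    Mat src(3, 8, CV_32F);   // channels x (rows*cols), toy sizes
    randu(src, -1.f, 1.f);

    float eps = 1e-10f;                          // layer default for "eps"
    double sumSq = sum(src.mul(src))[0] + eps;   // add eps before the sqrt
    Mat dst = src / std::sqrt(sumSq);            // src / ||src||_2
    return 0;
}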
diff --git a/modules/dnn/src/layers/normalize_bbox_layer.hpp b/modules/dnn/src/layers/normalize_bbox_layer.hpp
deleted file mode 100644
index 825a0f8d997..00000000000
--- a/modules/dnn/src/layers/normalize_bbox_layer.hpp
+++ /dev/null
@@ -1,94 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DNN_LAYERS_NORMALIZEBBOX_LAYER_HPP__
-#define __OPENCV_DNN_LAYERS_NORMALIZEBBOX_LAYER_HPP__
-#include "../precomp.hpp"
-
-namespace cv
-{
-namespace dnn
-{
-class NormalizeBBoxLayer : public Layer
-{
-    Mat _buffer;
-
-    Mat _sumChannelMultiplier;
-    Mat _sumSpatialMultiplier;
-
-    Blob _scale;
-
-    float _eps;
-    bool _across_spatial;
-    bool _channel_shared;
-
-    size_t _num;
-    size_t _channels;
-    size_t _rows;
-    size_t _cols;
-
-    size_t _channelSize;
-    size_t _imageSize;
-
-    static const size_t _numAxes = 4;
-    static const std::string _layerName;
-
-public:
-    NormalizeBBoxLayer(LayerParams &params);
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-
-    void checkInputs(const std::vector<Blob*> &inputs);
-
-    template<typename T>
-    T getParameter(const LayerParams &params,
-                   const std::string &parameterName,
-                   const size_t &idx = 0,
-                   const bool required = true,
-                   const T& defaultValue = T());
-
-    bool getParameterDict(const LayerParams &params,
-                          const std::string &parameterName,
-                          DictValue& result);
-};
-}
-}
-#endif
diff --git a/modules/dnn/src/layers/op_blas.hpp b/modules/dnn/src/layers/op_blas.hpp
index 55c70d87dec..d9a264320bf 100644
--- a/modules/dnn/src/layers/op_blas.hpp
+++ b/modules/dnn/src/layers/op_blas.hpp
@@ -56,4 +56,4 @@ namespace dnn
     void gemmCPU(const Mat &A, const Mat &B, double alpha, Mat &C, double beta, int flags = 0);
 }
 }
-#endif
\ No newline at end of file
+#endif
diff --git a/modules/dnn/src/layers/op_im2col.cpp b/modules/dnn/src/layers/op_im2col.cpp
index 4adeec7b78d..bae2011d0e5 100644
--- a/modules/dnn/src/layers/op_im2col.cpp
+++ b/modules/dnn/src/layers/op_im2col.cpp
@@ -44,125 +44,3 @@
 #include "opencl_kernels_dnn.hpp"
 #include "op_im2col.hpp"
 #include "opencl_kernels_dnn.hpp"
-
-namespace cv
-{
-namespace dnn
-{
-
-#ifdef HAVE_OPENCL
-
-bool im2col_ocl(const UMat &img,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int dilation_h, int dilation_w,
-                UMat &col)
-{
-    //TODO
-    CV_Assert(dilation_h == 1 && dilation_w == 1);
-
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-    int esz = img.elemSize();
-
-    CV_Assert(img.isContinuous() && col.isContinuous());
-
    CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
-
-    ocl::Kernel ker("im2col", ocl::dnn::im2col_oclsrc, String("-DT=") + ocl::typeToStr(img.type()));
-    if (ker.empty())
-        return false;
-
-    ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset/esz,
-             channels, height, width,
-             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
-             height_col, width_col,
-             ocl::KernelArg::PtrWriteOnly(col), (int)col.offset/esz
-             );
-
-    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
-    size_t globalSize = (size_t)channels * height_col * width_col;
-    return ker.run(1, &globalSize, &localSize, true);
-}
-
-bool col2im_ocl(const UMat &col,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                UMat &img)
-{
-    int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
-    int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
-    int channels_col = channels * kernel_h * kernel_w;
-    int esz = img.elemSize();
-
-    CV_Assert(img.isContinuous() && col.isContinuous());
-    CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)channels_col * height_col * width_col);
-
-    ocl::Kernel ker("col2im", ocl::dnn::col2im_oclsrc, String("-DT=") + ocl::typeToStr(col.type()));
-    if (ker.empty())
-        return false;
-
-    ker.args((int)img.total(),
-             ocl::KernelArg::PtrReadOnly(col), (int)col.offset/esz,
-             height, width, channels,
-             kernel_h, kernel_w,
-             pad_h, pad_w,
-             stride_h, stride_w,
-             height_col, width_col,
-             ocl::KernelArg::PtrWriteOnly(img), (int)img.offset/esz);
-
-    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
-    size_t globalSize = img.total();
-    return ker.run(1, &globalSize, &localSize, true);
-}
-
-#endif
-}
-}
-
-namespace cv
-{
-namespace dnn
-{
-
-#ifdef HAVE_OPENCL
-void im2col_ocl(UMat &img,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int height_out, int width_out,
-                UMat &col)
-{
-    int h_out = height_out;
-    int w_out = width_out;
-
-    CV_Assert(img.isContinuous() && col.isContinuous());
-    CV_Assert(img.total() == (size_t)channels * height * width);
-    CV_Assert(col.total() == (size_t)channels * kernel_h * kernel_w * h_out * w_out);
-
-    ocl::Kernel im2col_ker("im2col", ocl::dnn::im2col_oclsrc);
-    CV_Assert(!im2col_ker.empty());
-
-    im2col_ker.args(ocl::KernelArg::PtrReadOnly(img), (int)img.offset,
-                    channels, height, width,
-                    kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
-                    h_out, w_out,
-                    ocl::KernelArg::PtrWriteOnly(col), (int)col.offset
-                    );
-
-    size_t localSize = ocl::Device::getDefault().maxWorkGroupSize();
-    size_t globalSize = (size_t)channels * h_out * w_out;
-
-    CV_Assert(im2col_ker.run(1, &globalSize, &localSize, true));
-}
-#endif // HAVE_OPENCL
-
-}
-}
diff --git a/modules/dnn/src/layers/op_im2col.hpp b/modules/dnn/src/layers/op_im2col.hpp
index 3026991e26d..488fab30fc5 100644
--- a/modules/dnn/src/layers/op_im2col.hpp
+++ b/modules/dnn/src/layers/op_im2col.hpp
@@ -308,23 +308,6 @@ void col2im_cpu(const Dtype* data_col,
     }
 }
-#ifdef HAVE_OPENCL
-bool im2col_ocl(const UMat &img,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                int dilation_h, int dilation_w,
-                UMat &col);
-
-bool col2im_ocl(const UMat &col,
-                int channels, int height, int width,
-                int kernel_h, int kernel_w,
-                int pad_h, int pad_w,
-                int stride_h, int stride_w,
-                UMat &img);
-#endif
-
 }
 }
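With the OpenCL im2col/col2im kernels removed here, only the CPU templates in op_im2col.hpp remain. For orientation, a minimal scalar im2col covering stride and padding only (no dilation) — a sketch of the layout transform those kernels produced, not the module's actual implementation:

#include <vector>

// Lay out every kernel-sized patch of a C x H x W image as one column of a
// (C*kh*kw) x (Hout*Wout) matrix, zero-filling out-of-range (padded) pixels.
static void im2col(const float* img, int C, int H, int W,
                   int kh, int kw, int padH, int padW,
                   int strideH, int strideW, std::vector<float>& col)
{
    int Hout = (H + 2 * padH - kh) / strideH + 1;
    int Wout = (W + 2 * padW - kw) / strideW + 1;
    col.assign((size_t)C * kh * kw * Hout * Wout, 0.f);
    size_t idx = 0;
    for (int c = 0; c < C; c++)
        for (int ky = 0; ky < kh; ky++)
            for (int kx = 0; kx < kw; kx++)
                for (int y = 0; y < Hout; y++)
                    for (int x = 0; x < Wout; x++, idx++)
                    {
                        int sy = y * strideH - padH + ky; // source pixel row
                        int sx = x * strideW - padW + kx; // source pixel col
                        if (0 <= sy && sy < H && 0 <= sx && sx < W)
                            col[idx] = img[(c * H + sy) * W + sx];
                    }
}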
diff --git a/modules/dnn/src/layers/padding_layer.cpp b/modules/dnn/src/layers/padding_layer.cpp
index 0a682906f68..6704e76e882 100644
--- a/modules/dnn/src/layers/padding_layer.cpp
+++ b/modules/dnn/src/layers/padding_layer.cpp
@@ -9,7 +9,7 @@ Implementation of padding layer, which adds paddings to input blob.
 */
-#include "padding_layer.hpp"
+#include "../precomp.hpp"
 #include
 namespace cv
 {
 namespace dnn
 {
-PaddingLayer::PaddingLayer(LayerParams &params)
+class PaddingLayerImpl : public PaddingLayer
 {
-    paddingDim = params.get<int>("padding_dim");
-    padding = abs(params.get<int>("padding"));
-    inputDims = params.get<int>("input_dims", 0);
-    index = params.get<int>("index", 0);
-    paddingValue = params.get<float>("value", 0);
-
-    if(paddingDim < 0 || padding < 0)
-        CV_Error(cv::Error::StsNotImplemented, "Negative padding and dim aren't supported");
-}
+public:
+    PaddingLayerImpl(const LayerParams &params)
+    {
+        setParamsFrom(params);
+        paddingDim = params.get<int>("padding_dim");
+        padding = abs(params.get<int>("padding"));
+        inputDims = params.get<int>("input_dims", 0);
+        index = params.get<int>("index", 0);
+        paddingValue = params.get<float>("value", 0);
-void PaddingLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    outputs.resize(inputs.size());
-    for(int i = 0; i < inputs.size(); i++)
+        if(paddingDim < 0 || padding < 0)
+            CV_Error(cv::Error::StsNotImplemented, "Negative padding and dim aren't supported");
+    }
+
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        BlobShape shape = inputs[i]->shape();
-        int dim = getPadDim(shape);
-        CV_Assert(dim < shape.dims());
+        size_t i, ninputs = inputs.size();
+        outputs.resize(ninputs);
-        shape[dim] += padding;
-        outputs[i].create(shape);
+        for( i = 0; i < ninputs; i++ )
+        {
+            const Mat& inp = *inputs[i];
+            int dims = inp.dims;
+            std::vector<int> shape(inp.size.p, inp.size.p + dims);
+            int dim = getPadDim(shape);
+            CV_Assert(dim < dims);
+
+            shape[dim] += padding;
+            outputs[i].create(dims, &shape[0], inp.type());
+        }
     }
-}
-void PaddingLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    for(int i = 0; i < inputs.size(); i++)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        outputs[i].matRef() = paddingValue;
-        BlobShape inShape = inputs[i]->shape();
-        BlobShape outShape = outputs[i].shape();
-        int dim = getPadDim(inShape);
+        for(int i = 0; i < inputs.size(); i++)
+        {
+            outputs[i] = paddingValue;
+            const Mat& inp = *inputs[i];
+            Mat& out = outputs[i];
+            int dims = inp.dims;
+            std::vector<int> inShape(inp.size.p, inp.size.p + dims);
+            std::vector<int> outShape(out.size.p, out.size.p + dims);
+            int dim = getPadDim(inShape);
-        int actualIndex = index;
-        if(index == 0)
-            actualIndex = inShape[dim];
+            int actualIndex = index;
+            if(index == 0)
+                actualIndex = inShape[dim];
-        std::vector<std::pair<Range, Range> > srcDstRanges;
-        srcDstRanges.push_back(std::make_pair(Range(0, actualIndex), Range(0, actualIndex)));
-        srcDstRanges.push_back(std::make_pair(Range(actualIndex, inShape[dim]),
-                                              Range(actualIndex + padding, outShape[dim])));
+            std::vector<std::pair<Range, Range> > srcDstRanges;
+            srcDstRanges.push_back(std::make_pair(Range(0, actualIndex), Range(0, actualIndex)));
+            srcDstRanges.push_back(std::make_pair(Range(actualIndex, inShape[dim]),
+                                                  Range(actualIndex + padding, outShape[dim])));
-        std::vector<Range> srcRanges(inShape.dims(), Range::all()), dstRanges = srcRanges;
+            std::vector<Range> srcRanges(dims, Range::all()), dstRanges = srcRanges;
-        for(int j = 0; j < srcDstRanges.size(); j++)
-        {
-            if(!srcDstRanges[j].first.empty())
+            for(int j = 0; j < srcDstRanges.size(); j++)
             {
-                srcRanges[dim] = srcDstRanges[j].first;
-                dstRanges[dim] = srcDstRanges[j].second;
-                Mat dst = outputs[i].matRef()(&dstRanges[0]);
-                Mat src = inputs[i]->matRef()(&srcRanges[0]).clone();
-                src.copyTo(dst);
+                if(!srcDstRanges[j].first.empty())
+                {
+                    srcRanges[dim] = srcDstRanges[j].first;
+                    dstRanges[dim] =
srcDstRanges[j].second;
+                    Mat dst = out(&dstRanges[0]);
+                    Mat src = inp(&srcRanges[0]).clone();
+                    src.copyTo(dst);
+                }
             }
         }
    }
-}
-int PaddingLayer::getPadDim(const BlobShape& shape) const
+    int getPadDim(const std::vector<int>& shape) const
+    {
+        return inputDims > 0 && (int)shape.size() > inputDims ? paddingDim + 1 : paddingDim;
+    }
+
+    int paddingDim, padding, inputDims, index;
+    float paddingValue;
+};
+
+Ptr<PaddingLayer> PaddingLayer::create(const LayerParams &params)
 {
-    return inputDims > 0 && shape.dims() > inputDims ? paddingDim + 1 : paddingDim;
+    return Ptr<PaddingLayer>(new PaddingLayerImpl(params));
 }
 }
diff --git a/modules/dnn/src/layers/padding_layer.hpp b/modules/dnn/src/layers/padding_layer.hpp
deleted file mode 100644
index 18de0961077..00000000000
--- a/modules/dnn/src/layers/padding_layer.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-// This file is part of OpenCV project.
-// It is subject to the license terms in the LICENSE file found in the top-level directory
-// of this distribution and at http://opencv.org/license.html.
-
-// Copyright (C) 2016, Intel Corporation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-
-/*
-Declaration of padding layer, which adds paddings to input blob.
-*/
-
-#ifndef __OPENCV_DNN_LAYERS_PADDING_LAYER_HPP__
-#define __OPENCV_DNN_LAYERS_PADDING_LAYER_HPP__
-#include "../precomp.hpp"
-
-namespace cv
-{
-namespace dnn
-{
-
-class PaddingLayer : public Layer
-{
-public:
-    PaddingLayer() {}
-    PaddingLayer(LayerParams &params);
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-
-private:
-    int getPadDim(const BlobShape& shape) const;
-    int paddingDim, padding, inputDims, index;
-    float paddingValue;
-};
-
-}
-}
-#endif
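To make the parameters above concrete: with padding_dim = 2 and padding = 2, a 1x1x4x4 input is allocated as 1x1x6x4, and index chooses where along that axis the original data is split (index = 0 means the new rows go at the end). A sketch of just the shape arithmetic, mirroring getPadDim():

#include <cassert>
#include <vector>

int main()
{
    std::vector<int> shape;
    shape.push_back(1); shape.push_back(1); shape.push_back(4); shape.push_back(4);

    int paddingDim = 2, padding = 2, inputDims = 0;
    // Same rule as getPadDim(): shift the axis by one when the blob carries an
    // extra leading dimension beyond the declared input_dims.
    int dim = (inputDims > 0 && (int)shape.size() > inputDims) ? paddingDim + 1
                                                               : paddingDim;
    shape[dim] += padding;   // the output allocation grows only this axis
    assert(shape[2] == 6 && shape[3] == 4);
    return 0;
}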
diff --git a/modules/dnn/src/layers/permute_layer.cpp b/modules/dnn/src/layers/permute_layer.cpp
index 41c8399cec3..18943c626bc 100644
--- a/modules/dnn/src/layers/permute_layer.cpp
+++ b/modules/dnn/src/layers/permute_layer.cpp
@@ -41,7 +41,6 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
-#include "permute_layer.hpp"
 #include
 #include
@@ -49,137 +48,184 @@ namespace cv
 {
 namespace dnn
 {
-void PermuteLayer::checkCurrentOrder(int currentOrder)
+class PermuteLayerImpl : public PermuteLayer
 {
-    if(currentOrder < 0 || currentOrder > 3)
+public:
+    void checkCurrentOrder(int currentOrder)
     {
-        CV_Error(
-                 Error::StsBadArg,
-                 "Orders of dimensions in Permute layer parameter"
-                 "must be in [0...3] interval");
-    }
-
-    if(std::find(_order.begin(), _order.end(), currentOrder) != _order.end())
-    {
-        CV_Error(Error::StsBadArg,
-                 "Permute layer parameter contains duplicated orders.");
-    }
-}
+        if(currentOrder < 0 || currentOrder > 3)
+        {
+            CV_Error(
+                     Error::StsBadArg,
+                     "Orders of dimensions in Permute layer parameter"
+                     "must be in [0...3] interval");
+        }
-void PermuteLayer::checkNeedForPermutation()
-{
-    _needsPermute = false;
-    for (size_t i = 0; i < _numAxes; ++i)
-    {
-        if (_order[i] != i)
+        if(std::find(_order.begin(), _order.end(), currentOrder) != _order.end())
         {
-            _needsPermute = true;
-            break;
+            CV_Error(Error::StsBadArg,
+                     "Permute layer parameter contains duplicated orders.");
         }
     }
-}
+    void checkNeedForPermutation()
     {
        _needsPermute = false;
+        for (size_t i = 0; i < _numAxes; ++i)
+        {
+            if (_order[i] != i)
+            {
+                _needsPermute = true;
+                break;
+            }
+        }
    }
-PermuteLayer::PermuteLayer(LayerParams &params) : Layer(params)
-{
-    if (!params.has("order"))
+    PermuteLayerImpl(const LayerParams &params)
     {
-        _needsPermute = false;
-        return;
-    }
-
-    DictValue paramOrder = params.get("order");
-    if(paramOrder.size() > 4)
-    {
-        CV_Error(
-                 Error::StsBadArg,
-                 "Too many (> 4) orders of dimensions in Permute layer");
+        if (!params.has("order"))
+        {
+            _needsPermute = false;
+            return;
+        }
-    _numAxes = paramOrder.size();
-
-    for (size_t i = 0; i < _numAxes; i++)
+        DictValue paramOrder = params.get("order");
+        if(paramOrder.size() > 4)
+        {
+            CV_Error(
+                     Error::StsBadArg,
+                     "Too many (> 4) orders of dimensions in Permute layer");
+        }
-        int currentOrder = paramOrder.get<int>(i);
-        checkCurrentOrder(currentOrder);
-        _order.push_back(currentOrder);
-    }
+        _numAxes = paramOrder.size();
+        for (size_t i = 0; i < _numAxes; i++)
+        {
+            int currentOrder = paramOrder.get<int>(i);
+            checkCurrentOrder(currentOrder);
+            _order.push_back(currentOrder);
+        }
-    checkNeedForPermutation();
-}
+        setParamsFrom(params);
+        checkNeedForPermutation();
    }
-void PermuteLayer::computeStrides()
-{
-    _oldStride.resize(_numAxes);
-    _newStride.resize(_numAxes);
+    void computeStrides()
    {
-    _oldStride[_numAxes - 1] = 1;
-    _newStride[_numAxes - 1] = 1;
+        _oldStride.resize(_numAxes);
+        _newStride.resize(_numAxes);
-    for(int i = _numAxes - 2; i >= 0; i--)
-    {
-        _oldStride[i] = _oldStride[i + 1] * _oldDimensionSize[i + 1];
-        _newStride[i] = _newStride[i + 1] * _newDimensionSize[i + 1];
+        _oldStride[_numAxes - 1] = 1;
+        _newStride[_numAxes - 1] = 1;
+        for(int i = _numAxes - 2; i >= 0; i--)
+        {
+            _oldStride[i] = _oldStride[i + 1] * _oldDimensionSize[i + 1];
+            _newStride[i] = _newStride[i + 1] * _newDimensionSize[i + 1];
         }
-    _count = _oldStride[0] * _oldDimensionSize[0];
-}
-
-void PermuteLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    if(!_needsPermute)
+        _count = _oldStride[0] * _oldDimensionSize[0];
    }
-        return;
-    }
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-    CV_Assert(inputs.size() > 0);
-    CV_Assert((int)_numAxes == inputs[0]->shape().dims());
+        if(!_needsPermute)
+        {
+            return;
+        }
-    outputs.resize(inputs.size());
+        CV_Assert(inputs.size() > 0);
+        const Mat& inp0 = *inputs[0];
+        CV_Assert((int)_numAxes == inp0.dims);
-    _oldDimensionSize = inputs[0]->shape();
-    for (size_t i = 0; i < _numAxes; i++)
-    {
-        _newDimensionSize[i] = _oldDimensionSize[_order[i]];
+        outputs.resize(inputs.size());
+
+        _newDimensionSize.resize(_numAxes);
+        _oldDimensionSize.resize(_numAxes);
+
+        for (size_t i = 0; i < _numAxes; i++)
+        {
+            _oldDimensionSize[i] = inp0.size[i];
+            _newDimensionSize[i] = inp0.size[_order[i]];
         }
-    for (size_t i = 0; i < inputs.size(); i++)
+        for (size_t i = 0; i < inputs.size(); i++)
         {
-        CV_Assert(inputs[i]->rows() == _oldDimensionSize[2] && inputs[i]->cols() == _oldDimensionSize[3]);
-        outputs[i].create(BlobShape(_newDimensionSize));
-    }
+            CV_Assert(inputs[i]->size == inp0.size);
+            outputs[i].create(_numAxes, &_newDimensionSize[0], CV_32F);
+        }
+        computeStrides();
    }
-    computeStrides();
-}
-void PermuteLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    if(!_needsPermute)
-    {
-        for (size_t j = 0; j < inputs.size(); j++)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-            outputs[j].matRef() = inputs[j]->matRef();
+        size_t k, ninputs = inputs.size();
+        if(!_needsPermute)
+        {
+            for (k = 0; k < ninputs; k++)
+                outputs[k] = *inputs[k];
         }
-        return;
+        else
         {
-        float *srcData = inputs[k]->ptrf();
-        float *dstData = outputs[k].ptrf();
-
-        for (size_t i = 0; i < _count; ++i)
+            size_t i, j, count = _count, numAxes = _numAxes;
+            const size_t* newStride = &_newStride[0];
+            const size_t*
oldStride = &_oldStride[0];
+            const size_t* order = &_order[0];
         {
-            int oldPosition = 0;
-            int newPosition = i;
+            for (k = 0; k < ninputs; k++)
             {
-            for (size_t j = 0; j < _numAxes; ++j)
+                const Mat& inp = *inputs[k];
+                Mat& out = outputs[k];
+
+                CV_Assert(inp.dims == numAxes && inp.size == inputs[0]->size);
+                CV_Assert(out.dims == numAxes && out.size == outputs[0].size);
+
+                for( i = 0; i < numAxes; i++ )
+                {
+                    CV_Assert(inp.size[i] == _oldDimensionSize[i]);
+                    CV_Assert(out.size[i] == _newDimensionSize[i]);
+                }
+
+                CV_Assert(inp.isContinuous() && out.isContinuous());
+                CV_Assert(inp.type() == CV_32F && out.type() == CV_32F);
+
+                const float *srcData = inp.ptr<float>();
+                float *dstData = out.ptr<float>();
+
+                for (i = 0; i < count; ++i)
                 {
-                oldPosition += (newPosition / _newStride[j]) * _oldStride[_order[j]];
-                newPosition %= _newStride[j];
+                    size_t oldPosition = 0;
+                    size_t newPosition = i;
+
+                    for (j = 0; j < numAxes; ++j)
+                    {
+                        oldPosition += (newPosition / newStride[j]) * oldStride[order[j]];
+                        newPosition %= newStride[j];
+                    }
+                    dstData[i] = srcData[oldPosition];
+                }
             }
-            dstData[i] = srcData[oldPosition];
         }
    }
+
+    size_t _count;
+    std::vector<size_t> _order;
+
+    std::vector<int> _oldDimensionSize;
+    std::vector<int> _newDimensionSize;
+
+    std::vector<size_t> _oldStride;
+    std::vector<size_t> _newStride;
+    bool _needsPermute;
+
+    size_t _numAxes;
+};
+
+Ptr<PermuteLayer> PermuteLayer::create(const LayerParams &params)
+{
+    return Ptr<PermuteLayer>(new PermuteLayerImpl(params));
 }
+
 }
 }
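The forward() above is a flat-index remap driven by the precomputed strides. The same index arithmetic in isolation, for a 4-D permutation (the shape and order are chosen arbitrarily for the example):

#include <cassert>

int main()
{
    const size_t numAxes = 4;
    size_t dims[4]  = { 2, 3, 4, 5 };    // old shape
    size_t order[4] = { 0, 2, 3, 1 };    // e.g. NCHW -> NHWC

    size_t oldDim[4], newDim[4], oldStride[4], newStride[4];
    for (size_t i = 0; i < numAxes; i++) { oldDim[i] = dims[i]; newDim[i] = dims[order[i]]; }
    oldStride[numAxes - 1] = newStride[numAxes - 1] = 1;
    for (int i = (int)numAxes - 2; i >= 0; i--)
    {
        oldStride[i] = oldStride[i + 1] * oldDim[i + 1];
        newStride[i] = newStride[i + 1] * newDim[i + 1];
    }

    // Where does flat element i of the permuted blob come from?
    size_t i = 17, oldPos = 0, newPos = i;
    for (size_t j = 0; j < numAxes; j++)
    {
        oldPos += (newPos / newStride[j]) * oldStride[order[j]];
        newPos %= newStride[j];
    }
    assert(oldPos < dims[0] * dims[1] * dims[2] * dims[3]);
    return 0;
}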
diff --git a/modules/dnn/src/layers/permute_layer.hpp b/modules/dnn/src/layers/permute_layer.hpp
deleted file mode 100644
index cc51c605585..00000000000
--- a/modules/dnn/src/layers/permute_layer.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
-//
-// This software is provided by the copyright holders and contributors "as is" and
-// any express or implied warranties, including, but not limited to, the implied
-// warranties of merchantability and fitness for a particular purpose are disclaimed.
-// In no event shall the Intel Corporation or contributors be liable for any direct,
-// indirect, incidental, special, exemplary, or consequential damages
-// (including, but not limited to, procurement of substitute goods or services;
-// loss of use, data, or profits; or business interruption) however caused
-// and on any theory of liability, whether in contract, strict liability,
-// or tort (including negligence or otherwise) arising in any way out of
-// the use of this software, even if advised of the possibility of such damage.
-//
-//M*/
-
-#ifndef __OPENCV_DNN_LAYERS_PERMUTE_LAYER_HPP__
-#define __OPENCV_DNN_LAYERS_PERMUTE_LAYER_HPP__
-#include "../precomp.hpp"
-
-namespace cv
-{
-namespace dnn
-{
-class PermuteLayer : public Layer
-{
-    size_t _count;
-    std::vector<size_t> _order;
-
-    BlobShape _oldDimensionSize;
-    BlobShape _newDimensionSize;
-
-    std::vector<size_t> _oldStride;
-    std::vector<size_t> _newStride;
-    bool _needsPermute;
-
-    size_t _numAxes;
-
-    void checkCurrentOrder(int currentOrder);
-    void checkNeedForPermutation();
-    void computeStrides();
-
-public:
-    PermuteLayer(LayerParams &params);
-    void allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-    void forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs);
-};
-}
-}
-#endif
diff --git a/modules/dnn/src/layers/pooling_layer.cpp b/modules/dnn/src/layers/pooling_layer.cpp
index 9aaee31ead2..e37addd70e2 100644
--- a/modules/dnn/src/layers/pooling_layer.cpp
+++ b/modules/dnn/src/layers/pooling_layer.cpp
@@ -41,11 +41,8 @@
 #include "../precomp.hpp"
 #include "layers_common.hpp"
-#include "pooling_layer.hpp"
-#include "opencl_kernels_dnn.hpp"
 #include
 #include
-#include
 using std::max;
 using std::min;
@@ -53,273 +50,199 @@ namespace cv
 {
 namespace dnn
 {
-//TODO: add ceil_mode param
-
-PoolingLayerImpl::PoolingLayerImpl()
-{
-    globalPooling = false;
-}
-PoolingLayerImpl::PoolingLayerImpl(int type_, Size kernel_, Size stride_, Size pad_, const String &padMode_)
-{
-    globalPooling = false;
-    type = type_;
-    kernel = kernel_;
-    pad = pad_;
-    stride = stride_;
-    padMode = padMode_;
-}
-
-void PoolingLayerImpl::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
+//TODO: add ceil_mode param
+class PoolingLayerImpl : public PoolingLayer
 {
-    CV_Assert(inputs.size() == 1);
+public:
+    PoolingLayerImpl(const LayerParams& params)
+    {
+        type = PoolingLayer::MAX;
-    inp = inputs[0]->size2();
+        if (params.has("pool"))
+        {
+            String pool = params.get<String>("pool").toLowerCase();
+            if (pool == "max")
+                type = PoolingLayer::MAX;
+            else if (pool == "ave")
+                type = PoolingLayer::AVE;
+            else if (pool == "stochastic")
+                type = PoolingLayer::STOCHASTIC;
+            else
+                CV_Error(Error::StsBadArg, "Unknown pooling type \"" + pool + "\"");
+        }
-    if(globalPooling)
-    {
-        kernel = inp;
+        getPoolingKernelParams(params, kernel.height, kernel.width, globalPooling,
+                               pad.height, pad.width, stride.height, stride.width, padMode);
+        setParamsFrom(params);
     }
-    computeOutputShape(inp);
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() == 1);
-    useOpenCL = ocl::useOpenCL();
+        inp = Size(inputs[0]->size[3], inputs[0]->size[2]);
-    outputs.resize(type == MAX ?
2 * inputs.size() : inputs.size());
-    for (size_t i = 0; i < inputs.size(); i++)
-    {
-        CV_Assert(inputs[i]->rows() == inp.height && inputs[i]->cols() == inp.width);
-        if (type == MAX)
+        if(globalPooling)
         {
-            outputs[2 * i].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), out.height, out.width));
-            outputs[2 * i + 1].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), out.height, out.width));
+            kernel = inp;
         }
-        else
+
+        computeOutputShape(inp);
+
+        outputs.resize(type == MAX ? 2 * inputs.size() : inputs.size());
+        for (size_t i = 0; i < inputs.size(); i++)
         {
-            outputs[i].create(BlobShape(inputs[i]->num(), inputs[i]->channels(), out.height, out.width));
+            const Mat& inp_i = *inputs[i];
+            CV_Assert(inp_i.size[2] == inp.height && inp_i.size[3] == inp.width);
+            int outsz[] = { inp_i.size[0], inp_i.size[1], out.height, out.width };
+
+            if (type == MAX)
+            {
+                outputs[2 * i].create(4, outsz, CV_32F);
+                outputs[2 * i + 1].create(4, outsz, CV_32F);
+            }
+            else
+            {
+                outputs[i].create(4, outsz, CV_32F);
+            }
         }
    }
-}
-void PoolingLayerImpl::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    for (size_t ii = 0; ii < inputs.size(); ii++)
+    void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
     {
-        switch (type)
+        for (size_t ii = 0; ii < inputs.size(); ii++)
         {
-        case MAX:
-            maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
-            break;
-        case AVE:
-            avePooling(*inputs[ii], outputs[ii]);
-            break;
-        default:
-            CV_Error(Error::StsNotImplemented, "Not implemented");
-            break;
+            switch (type)
+            {
+            case MAX:
+                maxPooling(*inputs[ii], outputs[2 * ii], outputs[2 * ii + 1]);
+                break;
+            case AVE:
+                avePooling(*inputs[ii], outputs[ii]);
+                break;
+            default:
+                CV_Error(Error::StsNotImplemented, "Not implemented");
+                break;
+            }
         }
    }
-}
-
-void PoolingLayerImpl::maxPooling(Blob &src, Blob &dst, Blob &mask)
-{
-    if (!useOpenCL)
-        maxPooling_cpu(src, dst, mask);
-    else
-    {
-        CV_Assert(maxPooling_ocl(src, dst, mask));
-    }
-}
-bool PoolingLayerImpl::maxPooling_ocl(Blob &src, Blob &dst, Blob &mask)
-{
-    return pooling_ocl("MaxPoolForward", src, dst, &mask);
-}
-
-void PoolingLayerImpl::avePooling(Blob &src, Blob &dst)
-{
-    if (!useOpenCL)
-        avePooling_cpu(src, dst);
-    else
+    void maxPooling(Mat &src, Mat &dst, Mat &mask)
     {
-        CV_Assert(avePooling_ocl(src, dst));
-    }
-}
-
-bool PoolingLayerImpl::avePooling_ocl(Blob &src, Blob &dst)
-{
-    return pooling_ocl("AvePoolForward", src, dst);
-}
+        CV_DbgAssert(dst.size[2] == out.height && dst.size[3] == out.width);
-void PoolingLayerImpl::maxPooling_cpu(Blob &src, Blob &dst, Blob &mask)
-{
-    CV_DbgAssert(dst.rows() == out.height && dst.cols() == out.width);
-
-    for (int n = 0; n < src.num(); ++n)
-    {
-        for (int c = 0; c < src.channels(); ++c)
+        for (int n = 0; n < src.size[0]; ++n)
         {
-            const float *srcData = src.ptrf(n, c);
-            float *dstData = dst.ptrf(n, c);
-            float *dstMaskData = mask.ptrf(n, c);
-
-            for (int ph = 0; ph < out.height; ++ph)
+            for (int c = 0; c < src.size[1]; ++c)
             {
-                for (int pw = 0; pw < out.width; ++pw)
-                {
-                    int hstart = ph * stride.height - pad.height;
-                    int wstart = pw * stride.width - pad.width;
-                    int hend = min(hstart + kernel.height, inp.height);
-                    int wend = min(wstart + kernel.width, inp.width);
-                    hstart = max(hstart, 0);
-                    wstart = max(wstart, 0);
-                    const int poolIndex = ph * out.width + pw;
-                    float max_val = -FLT_MAX;
-                    int max_index = -1;
+                const float *srcData = src.ptr<float>(n, c);
+                float *dstData = dst.ptr<float>(n, c);
+                float *dstMaskData = mask.ptr<float>(n, c);
-                    for (int h = hstart; h < hend; ++h)
-                        for (int w = wstart; w < wend; ++w)
-                        {
- const int index = h * inp.width + w; - if (srcData[index] > max_val) + for (int ph = 0; ph < out.height; ++ph) + { + for (int pw = 0; pw < out.width; ++pw) + { + int hstart = ph * stride.height - pad.height; + int wstart = pw * stride.width - pad.width; + int hend = min(hstart + kernel.height, inp.height); + int wend = min(wstart + kernel.width, inp.width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + const int poolIndex = ph * out.width + pw; + float max_val = -FLT_MAX; + int max_index = -1; + + for (int h = hstart; h < hend; ++h) + for (int w = wstart; w < wend; ++w) { - max_val = srcData[index]; - max_index = index; + const int index = h * inp.width + w; + if (srcData[index] > max_val) + { + max_val = srcData[index]; + max_index = index; + } } - } - dstData[poolIndex] = max_val; - dstMaskData[poolIndex] = max_index; + dstData[poolIndex] = max_val; + dstMaskData[poolIndex] = max_index; + } } } } } -} - - -#ifdef HAVE_OPENCL -bool PoolingLayerImpl::pooling_ocl(const char *kname, const Blob &src, Blob &dst, Blob *mask) -{ - const UMat &srcMat = src.umatRefConst(); - UMat &dstMat = dst.umatRef(); - UMat *maskUMat = mask == NULL ? NULL : &mask->umatRef(); - CV_Assert(maskUMat == NULL || maskUMat->type() == CV_32FC1); // FIXIT CV_32SC1 - CV_Assert(maskUMat == NULL || maskUMat->offset == 0); - - CV_Assert(srcMat.offset == 0 && dstMat.offset == 0); - ocl::Kernel ker(kname, ocl::dnn::pooling_oclsrc, - cv::format("-DT=%s%s", ocl::typeToStr(src.type()), maskUMat ? " -DMASK=1" : "")); - if (ker.empty()) - return false; - - BlobShape s = src.shape(); - size_t nthreads = dst.total(); - if (maskUMat) - { - ker.args((int)nthreads, - ocl::KernelArg::PtrReadOnly(srcMat), s[0], s[1], s[2], s[3], - out.height, out.width, kernel.height, kernel.width, - stride.height, stride.width, pad.height, pad.width, - ocl::KernelArg::PtrWriteOnly(dstMat), - ocl::KernelArg::PtrWriteOnly(*maskUMat)); - } - else + void avePooling(Mat &src, Mat &dst) { - ker.args((int)nthreads, - ocl::KernelArg::PtrReadOnly(srcMat), s[0], s[1], s[2], s[3], - out.height, out.width, kernel.height, kernel.width, - stride.height, stride.width, pad.height, pad.width, - ocl::KernelArg::PtrWriteOnly(dstMat)); - } - - size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); - if (!ker.run(1, &nthreads, &wgSize, true)) - return false; - - return true; -} -#else -bool PoolingLayerImpl::pooling_ocl(const char*, const Blob&, Blob&, Blob*) -{ - return false; -} -#endif - -void PoolingLayerImpl::avePooling_cpu(Blob &src, Blob &dst) -{ - for (int n = 0; n < src.num(); ++n) - { - for (int c = 0; c < src.channels(); ++c) + for (int n = 0; n < src.size[0]; ++n) { - const float *srcData = src.ptrf(n, c); - float *dstData = dst.ptrf(n, c); - - for (int ph = 0; ph < out.height; ++ph) + for (int c = 0; c < src.size[1]; ++c) { - for (int pw = 0; pw < out.width; ++pw) - { - int hstart = ph * stride.height - pad.height; - int wstart = pw * stride.width - pad.width; - int hend = min(hstart + kernel.height, inp.height + pad.height); - int wend = min(wstart + kernel.width, inp.width + pad.width); - int poolSize = (hend - hstart) * (wend - wstart); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - hend = min(hend, inp.height); - wend = min(wend, inp.width); - - dstData[ph * out.width + pw] = 0.f; + const float *srcData = src.ptr(n, c); + float *dstData = dst.ptr(n, c); - for (int h = hstart; h < hend; ++h) - for (int w = wstart; w < wend; ++w) - dstData[ph * out.width + pw] += srcData[h * inp.width + w]; - - dstData[ph * out.width + pw] /= 
poolSize; + for (int ph = 0; ph < out.height; ++ph) + { + for (int pw = 0; pw < out.width; ++pw) + { + int hstart = ph * stride.height - pad.height; + int wstart = pw * stride.width - pad.width; + int hend = min(hstart + kernel.height, inp.height + pad.height); + int wend = min(wstart + kernel.width, inp.width + pad.width); + int poolSize = (hend - hstart) * (wend - wstart); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + hend = min(hend, inp.height); + wend = min(wend, inp.width); + + dstData[ph * out.width + pw] = 0.f; + + for (int h = hstart; h < hend; ++h) + for (int w = wstart; w < wend; ++w) + dstData[ph * out.width + pw] += srcData[h * inp.width + w]; + + dstData[ph * out.width + pw] /= poolSize; + } } } } } -} - -void PoolingLayerImpl::computeOutputShape(Size inpSz) -{ - if (padMode.empty()) { - //Yeah, something strange Caffe scheme-) - out.height = static_cast(ceil(static_cast(inpSz.height + 2 * pad.height - - kernel.height) / stride.height)) + 1; - out.width = static_cast(ceil(static_cast(inpSz.width + 2 * pad.width - - kernel.width) / stride.width)) + 1; - if (pad.height || pad.width) + void computeOutputShape(Size inpSz) + { + if (padMode.empty()) { + //Yeah, something strange Caffe scheme-) + out.height = static_cast(ceil(static_cast(inpSz.height + 2 * pad.height - + kernel.height) / stride.height)) + 1; + out.width = static_cast(ceil(static_cast(inpSz.width + 2 * pad.width - + kernel.width) / stride.width)) + 1; + + if (pad.height || pad.width) + { + // If we have padding, ensure that the last pooling starts strictly + // inside the image (instead of at the padding); otherwise clip the last. + if ((out.height - 1) * stride.height >= inpSz.height + pad.height) + --out.height; + if ((out.width - 1) * stride.width >= inpSz.width + pad.width) + --out.width; + CV_Assert((out.height - 1) * stride.height < inpSz.height + pad.height); + CV_Assert((out.width - 1) * stride.width < inpSz.width + pad.width); + } + } + else { - // If we have padding, ensure that the last pooling starts strictly - // inside the image (instead of at the padding); otherwise clip the last. - if ((out.height - 1) * stride.height >= inpSz.height + pad.height) - --out.height; - if ((out.width - 1) * stride.width >= inpSz.width + pad.width) - --out.width; - CV_Assert((out.height - 1) * stride.height < inpSz.height + pad.height); - CV_Assert((out.width - 1) * stride.width < inpSz.width + pad.width); + getConvPoolOutParams(inpSz.height, inpSz.width, kernel, stride, pad, + padMode, out.height, out.width); } } - else - { - getConvPoolOutParams(inpSz.height, inpSz.width, kernel, stride, pad, - padMode, out.height, out.width); - } -} -Ptr PoolingLayer::create(int type, Size kernel, Size stride, Size pad, - const String& padMode) -{ - return Ptr(new PoolingLayerImpl(type, kernel, stride, pad, padMode)); -} + Size inp, out; +}; -Ptr PoolingLayer::createGlobal(int type) +Ptr PoolingLayer::create(const LayerParams& params) { - Ptr l = PoolingLayer::create(type); - l->globalPooling = true; - return l; + return Ptr(new PoolingLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/pooling_layer.hpp b/modules/dnn/src/layers/pooling_layer.hpp deleted file mode 100644 index 266db1c50a4..00000000000 --- a/modules/dnn/src/layers/pooling_layer.hpp +++ /dev/null @@ -1,81 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
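For reference, the Caffe-compatible branch of computeOutputShape() above reduces to out = ceil((inp + 2*pad - kernel) / stride) + 1, clipped so that the last window starts inside the image rather than entirely in the padding. A minimal standalone sketch of one spatial axis; computePoolOutDim is an illustrative helper, not part of this patch:

#include <cmath>
#include <cassert>

// One spatial axis of the Caffe-style pooling output size used above.
static int computePoolOutDim(int inp, int kernel, int stride, int pad)
{
    // Caffe rounds up, so the last window may reach into the padded border...
    int out = (int)std::ceil((float)(inp + 2 * pad - kernel) / stride) + 1;
    if (pad > 0)
    {
        // ...but it must start strictly inside the image plus padding;
        // otherwise the last output row/column is dropped.
        if ((out - 1) * stride >= inp + pad)
            --out;
        assert((out - 1) * stride < inp + pad);
    }
    return out;
}

For example, inp = 7, kernel = 3, stride = 2, pad = 1 gives ceil((7 + 2 - 3) / 2) + 1 = 4, with no clipping since the last window starts at 3 * 2 - 1 = 5, inside the image.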
-// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
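As a reminder of what the MAX branch above computes: it fills two blobs per input, the pooled maxima and a float mask holding the flat in-plane index of each maximum (later consumed by unpooling layers). A hedged single-plane sketch of that inner loop; the function name and flat-pointer interface are illustrative, not part of the patch:

#include <cfloat>
#include <algorithm>

// Max-pool one inpH x inpW plane into an outH x outW plane plus argmax mask.
static void maxPoolPlane(const float* src, int inpH, int inpW,
                         float* dst, float* mask, int outH, int outW,
                         int kH, int kW, int strideH, int strideW,
                         int padH, int padW)
{
    for (int ph = 0; ph < outH; ++ph)
        for (int pw = 0; pw < outW; ++pw)
        {
            int hstart = std::max(ph * strideH - padH, 0);
            int wstart = std::max(pw * strideW - padW, 0);
            int hend = std::min(ph * strideH - padH + kH, inpH);
            int wend = std::min(pw * strideW - padW + kW, inpW);

            float maxVal = -FLT_MAX;
            int maxIdx = -1;
            for (int h = hstart; h < hend; ++h)
                for (int w = wstart; w < wend; ++w)
                    if (src[h * inpW + w] > maxVal)
                    {
                        maxVal = src[h * inpW + w];
                        maxIdx = h * inpW + w;    // flat index within the plane
                    }

            dst[ph * outW + pw] = maxVal;
            mask[ph * outW + pw] = (float)maxIdx; // stored as float, as in the layer
        }
}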
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_POOLING_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_POOLING_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class PoolingLayerImpl : public PoolingLayer -{ - bool useOpenCL; - Size inp, out; - - void computeOutputShape(Size inpSz); - - bool pooling_ocl(const char *kname, const Blob &src, Blob &dst, Blob *mask = NULL); - - void maxPooling(Blob &src, Blob &dst, Blob &mask); - void maxPooling_cpu(Blob &src, Blob &dst, Blob &mask); - bool maxPooling_ocl(Blob &src, Blob &dst, Blob &mask); - - void avePooling(Blob &src, Blob &dst); - void avePooling_cpu(Blob &src, Blob &dst); - bool avePooling_ocl(Blob &src, Blob &dst); - -public: - - PoolingLayerImpl(); - PoolingLayerImpl(int type, Size kernel, Size stride, Size pad, const String& padMode); - - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); -}; - -} -} - -#endif diff --git a/modules/dnn/src/layers/prior_box_layer.cpp b/modules/dnn/src/layers/prior_box_layer.cpp index a5343a41646..ee34485f9c5 100644 --- a/modules/dnn/src/layers/prior_box_layer.cpp +++ b/modules/dnn/src/layers/prior_box_layer.cpp @@ -41,7 +41,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "prior_box_layer.hpp" #include #include #include @@ -51,193 +50,182 @@ namespace cv namespace dnn { -const std::string PriorBoxLayer::_layerName = std::string("PriorBox"); - -bool PriorBoxLayer::getParameterDict(const LayerParams ¶ms, - const std::string ¶meterName, - DictValue& result) -{ - if (!params.has(parameterName)) - { - return false; - } - - result = params.get(parameterName); - return true; -} - -template -T PriorBoxLayer::getParameter(const LayerParams ¶ms, - const std::string ¶meterName, - const size_t &idx, - const bool required, - const T& defaultValue) +class PriorBoxLayerImpl : public PriorBoxLayer { - DictValue dictValue; - bool success = getParameterDict(params, parameterName, dictValue); - if(!success) +public: + bool getParameterDict(const LayerParams ¶ms, + const std::string ¶meterName, + DictValue& result) { - if(required) + if (!params.has(parameterName)) { - std::string message = _layerName; - message += " layer parameter does not contain "; - message += parameterName; - message += " parameter."; - CV_Error(Error::StsBadArg, message); + return false; } - else - { - return defaultValue; - } - } - return dictValue.get(idx); -} -void PriorBoxLayer::getAspectRatios(const LayerParams ¶ms) -{ - DictValue aspectRatioParameter; - bool aspectRatioRetieved = getParameterDict(params, "aspect_ratio", aspectRatioParameter); - CV_Assert(aspectRatioRetieved); + result = params.get(parameterName); + return true; + } - for (int i = 0; i < aspectRatioParameter.size(); ++i) + template + T getParameter(const LayerParams ¶ms, + const std::string ¶meterName, + const size_t &idx=0, + const bool required=true, + const T& defaultValue=T()) { - float aspectRatio = aspectRatioParameter.get(i); - bool alreadyExists = false; - - for (size_t j = 0; j < _aspectRatios.size(); ++j) + DictValue dictValue; + bool success = getParameterDict(params, parameterName, dictValue); + if(!success) { - if (fabs(aspectRatio - _aspectRatios[j]) < 1e-6) + if(required) { - alreadyExists = true; - break; + std::string message = _layerName; + message += " layer parameter does not contain "; + message += parameterName; + message += " parameter."; + CV_Error(Error::StsBadArg, message); } - } - if (!alreadyExists) - { - 
_aspectRatios.push_back(aspectRatio);
-            if (_flip)
+            else
             {
-                _aspectRatios.push_back(1./aspectRatio);
+                return defaultValue;
             }
         }
+        return dictValue.get<T>(idx);
     }
-}
-
-void PriorBoxLayer::getVariance(const LayerParams &params)
-{
-    DictValue varianceParameter;
-    bool varianceParameterRetrieved = getParameterDict(params, "variance", varianceParameter);
-    CV_Assert(varianceParameterRetrieved);
-    int varianceSize = varianceParameter.size();
-    if (varianceSize > 1)
+    void getAspectRatios(const LayerParams &params)
     {
-        // Must and only provide 4 variance.
-        CV_Assert(varianceSize == 4);
+        DictValue aspectRatioParameter;
+        bool aspectRatioRetieved = getParameterDict(params, "aspect_ratio", aspectRatioParameter);
+        CV_Assert(aspectRatioRetieved);

-        for (int i = 0; i < varianceSize; ++i)
+        for (int i = 0; i < aspectRatioParameter.size(); ++i)
         {
-            float variance = varianceParameter.get<float>(i);
-            CV_Assert(variance > 0);
-            _variance.push_back(variance);
+            float aspectRatio = aspectRatioParameter.get<float>(i);
+            bool alreadyExists = false;
+
+            for (size_t j = 0; j < _aspectRatios.size(); ++j)
+            {
+                if (fabs(aspectRatio - _aspectRatios[j]) < 1e-6)
+                {
+                    alreadyExists = true;
+                    break;
+                }
+            }
+            if (!alreadyExists)
+            {
+                _aspectRatios.push_back(aspectRatio);
+                if (_flip)
+                {
+                    _aspectRatios.push_back(1./aspectRatio);
+                }
+            }
         }
     }
-    else
+
+    void getVariance(const LayerParams &params)
     {
-        if (varianceSize == 1)
+        DictValue varianceParameter;
+        bool varianceParameterRetrieved = getParameterDict(params, "variance", varianceParameter);
+        CV_Assert(varianceParameterRetrieved);
+
+        int varianceSize = varianceParameter.size();
+        if (varianceSize > 1)
         {
-            float variance = varianceParameter.get<float>(0);
-            CV_Assert(variance > 0);
-            _variance.push_back(variance);
+            // Exactly 4 variance values must be provided.
+            CV_Assert(varianceSize == 4);
+
+            for (int i = 0; i < varianceSize; ++i)
+            {
+                float variance = varianceParameter.get<float>(i);
+                CV_Assert(variance > 0);
+                _variance.push_back(variance);
+            }
         }
         else
         {
-            // Set default to 0.1.
-            _variance.push_back(0.1f);
+            if (varianceSize == 1)
+            {
+                float variance = varianceParameter.get<float>(0);
+                CV_Assert(variance > 0);
+                _variance.push_back(variance);
+            }
+            else
+            {
+                // Set default to 0.1.
+                _variance.push_back(0.1f);
+            }
         }
     }

-PriorBoxLayer::PriorBoxLayer(LayerParams &params) : Layer(params)
-{
-    _minSize = getParameter<float>(params, "min_size");
-    CV_Assert(_minSize > 0);
+    PriorBoxLayerImpl(const LayerParams &params)
+    {
+        setParamsFrom(params);
+        _minSize = getParameter<float>(params, "min_size");
+        CV_Assert(_minSize > 0);

-    _flip = getParameter<bool>(params, "flip");
-    _clip = getParameter<bool>(params, "clip");
+        _flip = getParameter<bool>(params, "flip");
+        _clip = getParameter<bool>(params, "clip");

-    _aspectRatios.clear();
-    _aspectRatios.push_back(1.);
+        _aspectRatios.clear();
+        _aspectRatios.push_back(1.);

-    getAspectRatios(params);
-    getVariance(params);
+        getAspectRatios(params);
+        getVariance(params);

-    _numPriors = _aspectRatios.size();
+        _numPriors = _aspectRatios.size();

-    _maxSize = -1;
-    if (params.has("max_size"))
-    {
-        _maxSize = params.get("max_size").get<float>(0);
-        CV_Assert(_maxSize > _minSize);
+        _maxSize = -1;
+        if (params.has("max_size"))
+        {
+            _maxSize = params.get("max_size").get<float>(0);
+            CV_Assert(_maxSize > _minSize);

-        _numPriors += 1;
+            _numPriors += 1;
+        }
     }
-}

-void PriorBoxLayer::allocate(const std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
-{
-    CV_Assert(inputs.size() == 2);
+    void allocate(const std::vector<Mat*> &inputs, std::vector<Mat> &outputs)
+    {
+        CV_Assert(inputs.size() == 2);

-    _layerWidth = inputs[0]->cols();
-    _layerHeight = inputs[0]->rows();
+        _layerWidth = inputs[0]->size[3];
+        _layerHeight = inputs[0]->size[2];

-    _imageWidth = inputs[1]->cols();
-    _imageHeight = inputs[1]->rows();
+        _imageWidth = inputs[1]->size[3];
+        _imageHeight = inputs[1]->size[2];

-    _stepX = static_cast<float>(_imageWidth) / _layerWidth;
-    _stepY = static_cast<float>(_imageHeight) / _layerHeight;
+        _stepX = static_cast<float>(_imageWidth) / _layerWidth;
+        _stepY = static_cast<float>(_imageHeight) / _layerHeight;

-    // Since all images in a batch has same height and width, we only need to
-    // generate one set of priors which can be shared across all images.
-    size_t outNum = 1;
-    // 2 channels. First channel stores the mean of each prior coordinate.
-    // Second channel stores the variance of each prior coordinate.
-    _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4;
+        // Since all images in a batch have the same height and width, we only need to
+        // generate one set of priors which can be shared across all images.
+        int outNum = 1;
+        // 2 channels. First channel stores the mean of each prior coordinate.
+        // Second channel stores the variance of each prior coordinate.
+ int outChannels = 2; + _outChannelSize = _layerHeight * _layerWidth * _numPriors * 4; - outputs[0].create(BlobShape(outNum, outChannels, _outChannelSize)); - outputs[0].matRef() = 0; -} + int outsz[] = { outNum, outChannels, (int)_outChannelSize }; + outputs[0].create(3, outsz, CV_32F); + } -void PriorBoxLayer::forward(std::vector &inputs, std::vector &outputs) -{ - (void)inputs; // to suppress unused parameter warning + void forward(std::vector &inputs, std::vector &outputs) + { + (void)inputs; // to suppress unused parameter warning - float* outputPtr = outputs[0].ptrf(); + float* outputPtr = outputs[0].ptr(); - // first prior: aspect_ratio = 1, size = min_size - int idx = 0; - for (size_t h = 0; h < _layerHeight; ++h) - { - for (size_t w = 0; w < _layerWidth; ++w) + // first prior: aspect_ratio = 1, size = min_size + int idx = 0; + for (size_t h = 0; h < _layerHeight; ++h) { - _boxWidth = _boxHeight = _minSize; - - float center_x = (w + 0.5) * _stepX; - float center_y = (h + 0.5) * _stepY; - // xmin - outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth; - // ymin - outputPtr[idx++] = (center_y - _boxHeight / 2.) / _imageHeight; - // xmax - outputPtr[idx++] = (center_x + _boxWidth / 2.) / _imageWidth; - // ymax - outputPtr[idx++] = (center_y + _boxHeight / 2.) / _imageHeight; - - if (_maxSize > 0) + for (size_t w = 0; w < _layerWidth; ++w) { - // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) - _boxWidth = _boxHeight = sqrt(_minSize * _maxSize); + _boxWidth = _boxHeight = _minSize; + + float center_x = (w + 0.5) * _stepX; + float center_y = (h + 0.5) * _stepY; // xmin outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth; // ymin @@ -246,62 +234,112 @@ void PriorBoxLayer::forward(std::vector &inputs, std::vector &outpu outputPtr[idx++] = (center_x + _boxWidth / 2.) / _imageWidth; // ymax outputPtr[idx++] = (center_y + _boxHeight / 2.) / _imageHeight; - } - // rest of priors - for (size_t r = 0; r < _aspectRatios.size(); ++r) - { - float ar = _aspectRatios[r]; - if (fabs(ar - 1.) < 1e-6) + if (_maxSize > 0) { - continue; + // second prior: aspect_ratio = 1, size = sqrt(min_size * max_size) + _boxWidth = _boxHeight = sqrt(_minSize * _maxSize); + // xmin + outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth; + // ymin + outputPtr[idx++] = (center_y - _boxHeight / 2.) / _imageHeight; + // xmax + outputPtr[idx++] = (center_x + _boxWidth / 2.) / _imageWidth; + // ymax + outputPtr[idx++] = (center_y + _boxHeight / 2.) / _imageHeight; + } + + // rest of priors + for (size_t r = 0; r < _aspectRatios.size(); ++r) + { + float ar = _aspectRatios[r]; + if (fabs(ar - 1.) < 1e-6) + { + continue; + } + _boxWidth = _minSize * sqrt(ar); + _boxHeight = _minSize / sqrt(ar); + // xmin + outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth; + // ymin + outputPtr[idx++] = (center_y - _boxHeight / 2.) / _imageHeight; + // xmax + outputPtr[idx++] = (center_x + _boxWidth / 2.) / _imageWidth; + // ymax + outputPtr[idx++] = (center_y + _boxHeight / 2.) / _imageHeight; } - _boxWidth = _minSize * sqrt(ar); - _boxHeight = _minSize / sqrt(ar); - // xmin - outputPtr[idx++] = (center_x - _boxWidth / 2.) / _imageWidth; - // ymin - outputPtr[idx++] = (center_y - _boxHeight / 2.) / _imageHeight; - // xmax - outputPtr[idx++] = (center_x + _boxWidth / 2.) / _imageWidth; - // ymax - outputPtr[idx++] = (center_y + _boxHeight / 2.) 
/ _imageHeight;
                }
            }
-    }
-    // clip the prior's coordidate such that it is within [0, 1]
-    if (_clip)
-    {
-        for (size_t d = 0; d < _outChannelSize; ++d)
+        // clip the priors' coordinates such that they are within [0, 1]
+        if (_clip)
+        {
+            for (size_t d = 0; d < _outChannelSize; ++d)
+            {
+                outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
+            }
+        }
+        // set the variance.
+        outputPtr = outputs[0].ptr<float>(0, 1);
+        if(_variance.size() == 1)
         {
-            outputPtr[d] = std::min<float>(std::max<float>(outputPtr[d], 0.), 1.);
+            Mat secondChannel(outputs[0].size[2], outputs[0].size[3], CV_32F, outputPtr);
+            secondChannel.setTo(Scalar(_variance[0]));
         }
-    }
-    // set the variance.
-    outputPtr = outputs[0].ptrf(0, 1);
-    if(_variance.size() == 1)
-    {
-        Mat secondChannel(outputs[0].rows(), outputs[0].cols(), CV_32F, outputPtr);
-        secondChannel.setTo(Scalar(_variance[0]));
-    }
-    else
-    {
-        int count = 0;
-        for (size_t h = 0; h < _layerHeight; ++h)
+        else
         {
-            for (size_t w = 0; w < _layerWidth; ++w)
+            int count = 0;
+            for (size_t h = 0; h < _layerHeight; ++h)
             {
-                for (size_t i = 0; i < _numPriors; ++i)
+                for (size_t w = 0; w < _layerWidth; ++w)
                 {
-                    for (int j = 0; j < 4; ++j)
+                    for (size_t i = 0; i < _numPriors; ++i)
                     {
-                        outputPtr[count] = _variance[j];
-                        ++count;
+                        for (int j = 0; j < 4; ++j)
+                        {
+                            outputPtr[count] = _variance[j];
+                            ++count;
+                        }
                     }
                }
            }
        }
    }
+
+    size_t _layerWidth;
+    size_t _layerHeight;
+
+    size_t _imageWidth;
+    size_t _imageHeight;
+
+    size_t _outChannelSize;
+
+    float _stepX;
+    float _stepY;
+
+    float _minSize;
+    float _maxSize;
+
+    float _boxWidth;
+    float _boxHeight;
+
+    std::vector<float> _aspectRatios;
+    std::vector<float> _variance;
+
+    bool _flip;
+    bool _clip;
+
+    size_t _numPriors;
+
+    static const size_t _numAxes = 4;
+    static const std::string _layerName;
+};
+
+const std::string PriorBoxLayerImpl::_layerName = std::string("PriorBox");
+
+Ptr<PriorBoxLayer> PriorBoxLayer::create(const LayerParams &params)
+{
+    return Ptr<PriorBoxLayer>(new PriorBoxLayerImpl(params));
 }
+
 }
 }
diff --git a/modules/dnn/src/layers/prior_box_layer.hpp b/modules/dnn/src/layers/prior_box_layer.hpp
deleted file mode 100644
index e398aa1650e..00000000000
--- a/modules/dnn/src/layers/prior_box_layer.hpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*M///////////////////////////////////////////////////////////////////////////////////////
-//
-// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
-//
-// By downloading, copying, installing or using the software you agree to this license.
-// If you do not agree to this license, do not download, install,
-// copy or use the software.
-//
-//
-// License Agreement
-// For Open Source Computer Vision Library
-//
-// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
-// Third party copyrights are property of their respective owners.
-//
-// Redistribution and use in source and binary forms, with or without modification,
-// are permitted provided that the following conditions are met:
-//
-// * Redistribution's of source code must retain the above copyright notice,
-// this list of conditions and the following disclaimer.
-//
-// * Redistribution's in binary form must reproduce the above copyright notice,
-// this list of conditions and the following disclaimer in the documentation
-// and/or other materials provided with the distribution.
-//
-// * The name of the copyright holders may not be used to endorse or promote products
-// derived from this software without specific prior written permission.
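Concretely, forward() above writes each prior as four normalized corners around the cell center. A worked example with illustrative numbers (not taken from the patch):

#include <cstdio>

int main()
{
    // Assume a 10x10 feature map over a 300x300 image and min_size = 60.
    float stepX = 300.f / 10, stepY = 300.f / 10; // 30 pixels per cell
    float cx = (0 + 0.5f) * stepX;                // center of cell (w=0, h=0): 15
    float cy = (0 + 0.5f) * stepY;                // 15
    float boxW = 60.f, boxH = 60.f;               // first prior: aspect ratio 1
    float xmin = (cx - boxW / 2) / 300.f;         // -0.05 (clamped to 0 when clip is set)
    float ymin = (cy - boxH / 2) / 300.f;         // -0.05
    float xmax = (cx + boxW / 2) / 300.f;         //  0.15
    float ymax = (cy + boxH / 2) / 300.f;         //  0.15
    std::printf("prior: [%g %g %g %g]\n", xmin, ymin, xmax, ymax);
    // With max_size set, the second prior uses sqrt(min_size * max_size);
    // every other aspect ratio ar uses boxW = min_size * sqrt(ar) and
    // boxH = min_size / sqrt(ar).
    return 0;
}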
-// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_PRIOR_BOX_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_PRIOR_BOX_LAYER_HPP__ -#include "../precomp.hpp" - -namespace cv -{ -namespace dnn -{ -class PriorBoxLayer : public Layer -{ - size_t _layerWidth; - size_t _layerHeight; - - size_t _imageWidth; - size_t _imageHeight; - - size_t _outChannelSize; - - float _stepX; - float _stepY; - - float _minSize; - float _maxSize; - - float _boxWidth; - float _boxHeight; - - std::vector _aspectRatios; - std::vector _variance; - - bool _flip; - bool _clip; - - size_t _numPriors; - - static const size_t _numAxes = 4; - static const std::string _layerName; - -public: - PriorBoxLayer(LayerParams ¶ms); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); - - template - T getParameter(const LayerParams ¶ms, - const std::string ¶meterName, - const size_t &idx = 0, - const bool required = true, - const T& defaultValue = T()); - - bool getParameterDict(const LayerParams ¶ms, - const std::string ¶meterName, - DictValue& result); - - void getAspectRatios(const LayerParams ¶ms); - void getVariance(const LayerParams ¶ms); -}; -} -} -#endif diff --git a/modules/dnn/src/layers/recurrent_layers.cpp b/modules/dnn/src/layers/recurrent_layers.cpp index 65545fee783..afb0d9ccf22 100644 --- a/modules/dnn/src/layers/recurrent_layers.cpp +++ b/modules/dnn/src/layers/recurrent_layers.cpp @@ -40,9 +40,9 @@ //M*/ #include "../precomp.hpp" -#include "recurrent_layers.hpp" #include "op_blas.hpp" #include +#include #include #include @@ -85,25 +85,25 @@ class LSTMLayerImpl : public LSTMLayer int numOut, numTimeStamps, numSamples, numInp; Mat hInternal, cInternal; Mat gates, dummyOnes; - int dtype; bool allocated; - Shape outTailShape; //shape of single output sample - Shape outTsMatShape, outTsShape; //shape of N output samples - Shape outResShape; //shape of T timestamps and N output samples + std::vector outTailShape; //shape of single output sample + std::vector outTsMatShape, outTsShape; //shape of N output samples + std::vector outResShape; //shape of T timestamps and N output samples bool useTimestampDim; bool produceCellOutput; public: - LSTMLayerImpl() + LSTMLayerImpl(const LayerParams& params) { + setParamsFrom(params); type = "LSTM"; useTimestampDim = true; produceCellOutput = false; allocated = false; - outTailShape = Shape::empty(); + outTailShape.clear(); } void setUseTimstampsDim(bool use) @@ -118,146 +118,155 @@ class LSTMLayerImpl : public LSTMLayer produceCellOutput = produce; } - void setC(const Blob &C) + void setC(const Mat &C) { - CV_Assert(cInternal.empty() || C.total() == cInternal.total()); + CV_Assert(C.type() == CV_32F); if (!cInternal.empty()) - 
C.reshaped(Shape::like(cInternal)).matRefConst().copyTo(cInternal); + { + CV_Assert(C.total() == cInternal.total() && cInternal.isContinuous()); + Mat cInternal_(C.dims, &C.size.p[0], C.type(), cInternal.ptr()); + C.copyTo(cInternal_); + } else - C.matRefConst().copyTo(cInternal); + C.copyTo(cInternal); } - void setH(const Blob &H) + void setH(const Mat &H) { - CV_Assert(hInternal.empty() || H.total() == hInternal.total()); + CV_Assert(H.type() == CV_32F); if (!hInternal.empty()) - H.reshaped(Shape::like(hInternal)).matRefConst().copyTo(hInternal); + { + CV_Assert(H.total() == hInternal.total() && hInternal.isContinuous()); + Mat hInternal_(H.dims, &H.size.p[0], H.type(), hInternal.ptr()); + H.copyTo(hInternal_); + } else - H.matRefConst().copyTo(hInternal); + H.copyTo(hInternal); } - Blob getC() const + Mat getC() const { - CV_Assert(!cInternal.empty()); - - //TODO: add convinient Mat -> Blob constructor - Blob res(outTsShape, cInternal.type()); - res.fill(res.shape(), res.type(), cInternal.data); - return res; + CV_Assert(shapeTotal(outTsShape) == cInternal.total()); + return Mat((int)outTsShape.size(), &outTsShape[0], cInternal.type(), (char*)cInternal.ptr()); } - Blob getH() const + Mat getH() const { - CV_Assert(!hInternal.empty()); - - Blob res(outTsShape, hInternal.type()); - res.fill(res.shape(), res.type(), hInternal.data); - return res; + CV_Assert(shapeTotal(outTsShape) == hInternal.total()); + return Mat((int)outTsShape.size(), &outTsShape[0], hInternal.type(), (char*)hInternal.ptr()); } - void setOutShape(const Shape &outTailShape_) + void setOutShape(const std::vector &outTailShape_) { - CV_Assert(!allocated || outTailShape_.total() == outTailShape.total()); + CV_Assert(!allocated || shapeTotal(outTailShape) == shapeTotal(outTailShape_)); outTailShape = outTailShape_; } - void setWeights(const Blob &Wh, const Blob &Wx, const Blob &bias) + void setWeights(const Mat &Wh, const Mat &Wx, const Mat &bias) { - CV_Assert(Wh.dims() == 2 && Wx.dims() == 2); - CV_Assert(Wh.size(0) == Wx.size(0)); - CV_Assert(Wh.size(0) == 4*Wh.size(1)); - CV_Assert(Wh.size(0) == (int)bias.total()); + CV_Assert(Wh.dims == 2 && Wx.dims == 2); + CV_Assert(Wh.rows == Wx.rows); + CV_Assert(Wh.rows == 4*Wh.cols); + CV_Assert(Wh.rows == (int)bias.total()); CV_Assert(Wh.type() == Wx.type() && Wx.type() == bias.type()); blobs.resize(3); - blobs[0] = Wh; - blobs[1] = Wx; - blobs[2] = bias; - blobs[2].reshape(Shape(1, (int)bias.total())); + blobs[0] = Mat(Wh.clone()); + blobs[1] = Mat(Wx.clone()); + blobs[2] = Mat(bias.clone()).reshape(1, 1); } - void allocate(const std::vector &input, std::vector &output) + void allocate(const std::vector &input, std::vector &output) { CV_Assert(blobs.size() == 3); CV_Assert(input.size() == 1); + const Mat& inp0 = *input[0]; - Blob &Wh = blobs[0], &Wx = blobs[1]; - numOut = Wh.size(1); - numInp = Wx.size(1); + Mat &Wh = blobs[0], &Wx = blobs[1]; + numOut = Wh.size[1]; + numInp = Wx.size[1]; - if (!outTailShape.isEmpty()) - CV_Assert(outTailShape.total() == numOut); + if (!outTailShape.empty()) + CV_Assert(shapeTotal(outTailShape) == numOut); else - outTailShape = Shape(numOut); + outTailShape.assign(1, numOut); + outResShape.clear(); if (useTimestampDim) { - CV_Assert(input[0]->dims() >= 2 && (int)input[0]->total(2) == numInp); - numTimeStamps = input[0]->size(0); - numSamples = input[0]->size(1); - outResShape = Shape(numTimeStamps, numSamples) + outTailShape; + CV_Assert(inp0.dims >= 2 && (int)inp0.total(2) == numInp); + numTimeStamps = inp0.size[0]; + numSamples = 
inp0.size[1]; + outResShape.push_back(numTimeStamps); } else { - CV_Assert(input[0]->dims() >= 1 && (int)input[0]->total(1) == numInp); + CV_Assert(inp0.dims >= 2 && (int)inp0.total(1) == numInp); numTimeStamps = 1; - numSamples = input[0]->size(0); - outResShape = Shape(numSamples) + outTailShape; + numSamples = inp0.size[0]; } - outTsMatShape = Shape(numSamples, numOut); - outTsShape = Shape(numSamples) + outTailShape; - dtype = input[0]->type(); - CV_Assert(dtype == CV_32F || dtype == CV_64F); - CV_Assert(Wh.type() == dtype); + outResShape.push_back(numSamples); + outResShape.insert(outResShape.end(), outTailShape.begin(), outTailShape.end()); + + outTsMatShape.clear(); + outTsMatShape.push_back(numSamples); + outTsMatShape.push_back(numOut); + + outTsShape.clear(); + outTsShape.push_back(numSamples); + outTsShape.insert(outTsShape.end(), outTailShape.begin(), outTailShape.end()); + + const int dtype = CV_32F; + CV_Assert(inp0.type() == dtype && Wh.type() == dtype); - output.resize( (produceCellOutput) ? 2 : 1 ); - output[0].create(outResShape, dtype); - if (produceCellOutput) - output[1].create(outResShape, dtype); + size_t i, noutputs = produceCellOutput ? 2 : 1; + output.resize(noutputs); + + for( i = 0; i < noutputs; i++ ) + output[i].create(outResShape, dtype); if (hInternal.empty()) { - hInternal.create(outTsMatShape.dims(), outTsMatShape.ptr(), dtype); - hInternal.setTo(0); + hInternal.create(outTsMatShape, dtype); + hInternal.setTo(0.); } else { - CV_Assert((int)hInternal.total() == numSamples*numOut); - hInternal = hInternal.reshape(1, outTsMatShape.dims(), outTsMatShape.ptr()); + CV_Assert(hInternal.total() == (size_t)numSamples*numOut); + hInternal = hInternal.reshape(1, outTsMatShape); } if (cInternal.empty()) { - cInternal.create(outTsMatShape.dims(), outTsMatShape.ptr(), dtype); - cInternal.setTo(0); + cInternal.create(outTsMatShape, dtype); + cInternal.setTo(0.); } else { - CV_Assert((int)cInternal.total() == numSamples*numOut); - cInternal = cInternal.reshape(1, outTsMatShape.dims(), outTsMatShape.ptr()); + CV_Assert(cInternal.total() == (size_t)numSamples*numOut); + cInternal = cInternal.reshape(1, outTsMatShape); } gates.create(numSamples, 4*numOut, dtype); dummyOnes.create(numSamples, 1, dtype); - dummyOnes.setTo(1); + dummyOnes.setTo(1.); allocated = true; } - void forward(std::vector &input, std::vector &output) + void forward(std::vector &input, std::vector &output) { - const Mat &Wh = blobs[0].getRefConst(); - const Mat &Wx = blobs[1].getRefConst(); - const Mat &bias = blobs[2].getRefConst(); + const Mat &Wh = blobs[0]; + const Mat &Wx = blobs[1]; + const Mat &bias = blobs[2]; int numSamplesTotal = numTimeStamps*numSamples; - Mat xTs = reshaped(input[0]->getRefConst(), Shape(numSamplesTotal, numInp)); + Mat xTs = input[0]->reshape(1, numSamplesTotal); - Shape outMatShape(numSamplesTotal, numOut); - Mat hOutTs = reshaped(output[0].getRef(), outMatShape); - Mat cOutTs = (produceCellOutput) ? reshaped(output[1].getRef(), outMatShape) : Mat(); + Mat hOutTs = output[0].reshape(1, numSamplesTotal); + Mat cOutTs = produceCellOutput ? 
output[1].reshape(1, numSamplesTotal) : Mat(); for (int ts = 0; ts < numTimeStamps; ts++) { @@ -278,13 +287,13 @@ class LSTMLayerImpl : public LSTMLayer tanh(gateG, gateG); //compute c_t - cv::multiply(gateF, cInternal, gateF); // f_t (*) c_{t-1} - cv::multiply(gateI, gateG, gateI); // i_t (*) g_t - cv::add(gateF, gateI, cInternal); // c_t = f_t (*) c_{t-1} + i_t (*) g_t + multiply(gateF, cInternal, gateF); // f_t (*) c_{t-1} + multiply(gateI, gateG, gateI); // i_t (*) g_t + add(gateF, gateI, cInternal); // c_t = f_t (*) c_{t-1} + i_t (*) g_t //compute h_t tanh(cInternal, hInternal); - cv::multiply(gateO, hInternal, hInternal); + multiply(gateO, hInternal, hInternal); //save results in output blobs hInternal.copyTo(hOutTs.rowRange(curRowRange)); @@ -294,14 +303,9 @@ class LSTMLayerImpl : public LSTMLayer } }; -Ptr LSTMLayer::create() +Ptr LSTMLayer::create(const LayerParams& params) { - return Ptr(new LSTMLayerImpl()); -} - -void LSTMLayer::forward(std::vector&, std::vector&) -{ - CV_Error(Error::StsInternal, "This function should be unreached"); + return Ptr(new LSTMLayerImpl(params)); } int LSTMLayer::inputNameToIndex(String inputName) @@ -333,8 +337,9 @@ class RNNLayerImpl : public RNNLayer public: - RNNLayerImpl() + RNNLayerImpl(const LayerParams& params) { + setParamsFrom(params); type = "RNN"; produceH = false; } @@ -344,68 +349,74 @@ class RNNLayerImpl : public RNNLayer produceH = produce; } - void setWeights(const Blob &W_xh, const Blob &b_h, const Blob &W_hh, const Blob &W_ho, const Blob &b_o) + void setWeights(const Mat &W_xh, const Mat &b_h, const Mat &W_hh, const Mat &W_ho, const Mat &b_o) { - CV_Assert(W_hh.dims() == 2 && W_xh.dims() == 2); - CV_Assert(W_hh.size(0) == W_xh.size(0) && W_hh.size(0) == W_hh.size(1) && (int)b_h.total() == W_xh.size(0)); - CV_Assert(W_ho.size(0) == (int)b_o.total()); - CV_Assert(W_ho.size(1) == W_hh.size(1)); + CV_Assert(W_hh.dims == 2 && W_xh.dims == 2); + CV_Assert(W_hh.size[0] == W_xh.size[0] && W_hh.size[0] == W_hh.size[1] && (int)b_h.total() == W_xh.size[0]); + CV_Assert(W_ho.size[0] == (int)b_o.total()); + CV_Assert(W_ho.size[1] == W_hh.size[1]); blobs.resize(5); - blobs[0] = W_xh; - blobs[1] = b_h; - blobs[2] = W_hh; - blobs[3] = W_ho; - blobs[4] = b_o; + blobs[0] = Mat(W_xh.clone()); + blobs[1] = Mat(b_h.clone()); + blobs[2] = Mat(W_hh.clone()); + blobs[3] = Mat(W_ho.clone()); + blobs[4] = Mat(b_o.clone()); } - void allocate(const std::vector &input, std::vector &output) + void allocate(const std::vector &input, std::vector &output) { CV_Assert(input.size() >= 1 && input.size() <= 2); - Wxh = blobs[0].matRefConst(); - bh = blobs[1].matRefConst(); - Whh = blobs[2].matRefConst(); - Who = blobs[3].matRefConst(); - bo = blobs[4].matRefConst(); + Wxh = blobs[0]; + bh = blobs[1]; + Whh = blobs[2]; + Who = blobs[3]; + bo = blobs[4]; numH = Wxh.rows; numX = Wxh.cols; numO = Who.rows; - CV_Assert(input[0]->dims() >= 2); - CV_Assert((int)input[0]->total(2) == numX); - CV_Assert(input[0]->type() == CV_32F || input[0]->type() == CV_64F); - dtype = input[0]->type(); - numTimestamps = input[0]->size(0); - numSamples = input[0]->size(1); + const Mat& inp0 = *input[0]; + + CV_Assert(inp0.dims >= 2); + CV_Assert(inp0.total(2) == numX); + dtype = CV_32F; + CV_Assert(inp0.type() == dtype); + numTimestamps = inp0.size[0]; + numSamples = inp0.size[1]; numSamplesTotal = numTimestamps * numSamples; hCurr.create(numSamples, numH, dtype); hPrev.create(numSamples, numH, dtype); - hPrev.setTo(0); + hPrev.setTo(0.); dummyBiasOnes.create(numSamples, 1, dtype); - 
dummyBiasOnes.setTo(1); + dummyBiasOnes.setTo(1.); bh = bh.reshape(1, 1); //is 1 x numH Mat bo = bo.reshape(1, 1); //is 1 x numO Mat reshapeOutput(output); } - void reshapeOutput(std::vector &output) + void reshapeOutput(std::vector &output) { - output.resize((produceH) ? 2 : 1); - output[0].create(Shape(numTimestamps, numSamples, numO), dtype); + output.resize(produceH ? 2 : 1); + int sz0[] = { numTimestamps, numSamples, numO }; + output[0].create(3, sz0, dtype); if (produceH) - output[1].create(Shape(numTimestamps, numSamples, numH), dtype); + { + int sz1[] = { numTimestamps, numSamples, numH }; + output[1].create(3, sz1, dtype); + } } - void forward(std::vector &input, std::vector &output) + void forward(std::vector &input, std::vector &output) { - Mat xTs = reshaped(input[0]->getRefConst(), Shape(numSamplesTotal, numX)); - Mat oTs = reshaped(output[0].getRef(), Shape(numSamplesTotal, numO)); - Mat hTs = (produceH) ? reshaped(output[1].getRef(), Shape(numSamplesTotal, numH)) : Mat(); + Mat xTs = input[0]->reshape(1, numSamplesTotal); + Mat oTs = output[0].reshape(1, numSamplesTotal); + Mat hTs = produceH ? output[1].reshape(1, numSamplesTotal) : Mat(); for (int ts = 0; ts < numTimestamps; ts++) { @@ -428,14 +439,9 @@ class RNNLayerImpl : public RNNLayer } }; -void RNNLayer::forward(std::vector&, std::vector&) -{ - CV_Error(Error::StsInternal, "This function should be unreached"); -} - -CV_EXPORTS_W Ptr RNNLayer::create() +CV_EXPORTS_W Ptr RNNLayer::create(const LayerParams& params) { - return Ptr(new RNNLayerImpl()); + return Ptr(new RNNLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/recurrent_layers.hpp b/modules/dnn/src/layers/recurrent_layers.hpp deleted file mode 100644 index 54451218428..00000000000 --- a/modules/dnn/src/layers/recurrent_layers.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. 
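The per-timestamp arithmetic in LSTMLayerImpl::forward() above is the textbook LSTM cell: gates = x_t*Wx^T + h_{t-1}*Wh^T + b, then c_t = f (*) c_{t-1} + i (*) g and h_t = o (*) tanh(c_t). A compact cv::Mat sketch of one step, assuming the gate packing [i, f, o, g] along the 4*numOut axis; the layer's actual packing is fixed by its weight layout, so treat this as an illustration rather than the patch's exact code path:

#include <opencv2/core.hpp>
using namespace cv;

// x: N x numInp, h and c: N x numOut, Wx: 4*numOut x numInp,
// Wh: 4*numOut x numOut, bias: 1 x 4*numOut; all CV_32F.
static Mat sigmoid(const Mat& x)
{
    Mat e;
    exp(-x, e);              // e = exp(-x)
    return 1.0 / (1.0 + e);  // 1 / (1 + exp(-x))
}

static Mat tanhm(const Mat& x)
{
    Mat e, num, den, r;
    exp(-2.0 * x, e);        // e = exp(-2x)
    num = 1.0 - e;
    den = 1.0 + e;
    divide(num, den, r);     // tanh(x) = (1 - e) / (1 + e)
    return r;
}

static void lstmStep(const Mat& x, Mat& h, Mat& c,
                     const Mat& Wx, const Mat& Wh, const Mat& bias)
{
    int numOut = Wh.cols;
    Mat gates = x * Wx.t() + h * Wh.t() + repeat(bias, x.rows, 1);

    Mat i = sigmoid(gates.colRange(0, numOut));              // input gate
    Mat f = sigmoid(gates.colRange(numOut, 2 * numOut));     // forget gate
    Mat o = sigmoid(gates.colRange(2 * numOut, 3 * numOut)); // output gate
    Mat g = tanhm(gates.colRange(3 * numOut, 4 * numOut));   // candidate state

    c = f.mul(c) + i.mul(g); // c_t = f (*) c_{t-1} + i (*) g_t
    h = o.mul(tanhm(c));     // h_t = o (*) tanh(c_t)
}

The patch itself computes the gate matrix with gemm calls and a column of ones standing in for the bias broadcast; the elementwise part is the same.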
-// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_RECURRENT_LAYERS_HPP__ -#define __OPENCV_DNN_LAYERS_RECURRENT_LAYERS_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -} -} -#endif \ No newline at end of file diff --git a/modules/dnn/src/layers/reshape_layer.cpp b/modules/dnn/src/layers/reshape_layer.cpp index e3f0d1f8be8..f10fee4eae0 100644 --- a/modules/dnn/src/layers/reshape_layer.cpp +++ b/modules/dnn/src/layers/reshape_layer.cpp @@ -41,7 +41,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "reshape_layer.hpp" #include namespace cv @@ -49,73 +48,158 @@ namespace cv namespace dnn { -ReshapeLayerImpl::ReshapeLayerImpl(const BlobShape &newShape_, Range applyingRange_, bool enableReordering_) : - enableReordering(enableReordering_) +static void computeShapeByReshapeMask(const std::vector &srcShape, + const std::vector &maskShape, + Range srcRange /*= Range::all()*/, + std::vector& dstShape) { - newShapeDesc = newShape_; - newShapeRange = applyingRange_; -} + int srcShapeSize = (int)srcShape.size(); + int maskShapeSize = (int)maskShape.size(); -void ReshapeLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - outputs.resize(inputs.size()); - outShapes.resize(inputs.size()); + if (srcRange == Range::all()) + srcRange = Range(0, srcShapeSize); + else + { + int sz = srcRange.size(); + srcRange.start = srcRange.start < 0 ? srcRange.start + srcShapeSize : srcRange.start; + srcRange.end = srcRange.end == INT_MAX ? 
srcShapeSize : srcRange.start + sz; + } + + CV_Assert(0 <= srcRange.start && srcRange.start <= srcRange.end && srcRange.end <= srcShapeSize); + int dstShapeSize = srcShapeSize - srcRange.size() + maskShapeSize; + dstShape.resize(dstShapeSize); + + std::copy(srcShape.begin(), srcShape.begin() + srcRange.start, dstShape.begin()); + std::copy(srcShape.begin() + srcRange.end, srcShape.begin() + srcShapeSize, dstShape.begin() + srcRange.start + maskShapeSize); + + int inferDim = -1; + for (int i = 0; i < maskShapeSize; i++) + { + if (maskShape[i] > 0) + { + dstShape[srcRange.start + i] = maskShape[i]; + } + else if (maskShape[i] == 0) + { + if (srcRange.start + i >= srcShapeSize) + CV_Error(Error::StsBadArg, format("Copy dim[%d] (which has zero size) is out of the source shape bounds", srcRange.start + i)); + dstShape[srcRange.start + i] = srcShape[srcRange.start + i]; + } + else if (maskShape[i] == -1) + { + if (inferDim != -1) + CV_Error(Error::StsAssert, "Duplicate of inferred dim (which is denoted by -1)"); + inferDim = srcRange.start + i; + dstShape[inferDim] = 1; + } + else + CV_Error(Error::StsBadArg, "maskShape[i] >= -1"); + } + + size_t srcTotal = shapeTotal(srcShape); + size_t dstTotal = shapeTotal(dstShape); + + if (inferDim != -1) + { + if (srcTotal % dstTotal != 0) + CV_Error(Error::StsBackTrace, "Can't infer a dim denoted by -1"); - for (size_t i = 0; i < inputs.size(); i++) + dstShape[inferDim] = (int)(srcTotal / dstTotal); + } + else { - outShapes[i] = computeShapeByReshapeMask(inputs[i]->shape(), newShapeDesc, newShapeRange); - outputs[i].shareFrom(*inputs[i]); - outputs[i].reshape(outShapes[i]); + CV_Assert(srcTotal == dstTotal); } } -void ReshapeLayerImpl::forward(std::vector &inputs, std::vector &outputs) + +class ReshapeLayerImpl : public ReshapeLayer { - for (size_t i = 0; i < outputs.size(); i++) +public: + ReshapeLayerImpl(const LayerParams& params) { - Blob srcBlob = *inputs[i]; - BlobShape inputShape = inputs[i]->shape(); - bool channelsReduced = inputShape.dims() > outShapes[i].dims() || - (inputShape.dims() == 4 && inputShape[1] > outShapes[i][1]); - bool performReordering = enableReordering && inputShape.dims() == 4 && channelsReduced; - - if (performReordering) + setParamsFrom(params); + int axis = params.get("axis", 0); + int numAxes = params.get("num_axes", -1); + enableReordering = params.get("reorder_dims", false); + CV_Assert(numAxes >= -1); + newShapeRange = (numAxes == -1) ? 
Range(axis, INT_MAX) : Range(axis, axis + numAxes); + + newShapeDesc.clear(); + if (params.has("dim")) { - Blob reordered_blob(inputShape, inputs[i]->type()); - - float *dstData = reordered_blob.matRef().ptr(); - const float *srcData = srcBlob.matRefConst().ptr(); + const DictValue ¶mShape = params.get("dim"); + int i, dims = paramShape.size(); + newShapeDesc.resize(dims); + for (i = 0; i < dims; i++) + newShapeDesc[i] = paramShape.get(i); + } + } - int num = inputShape[0], channels = inputShape[1], height = inputShape[2], width = inputShape[3]; - int total = num*channels*height*width; - for(int i_n = 0; i_n < num; i_n++) { - for(int i_c = 0; i_c < channels; i_c++) { - for(int i_h = 0; i_h < height; i_h++) { - for(int i_w = 0; i_w < width; i_w++) { - int src_i = channels*height*width*i_n + height*width*i_c + width*i_h + i_w; - int dst_i = channels*height*width*i_n + i_c + channels*width*i_h + channels*i_w; + void allocate(const std::vector &inputs, std::vector &outputs) + { + outputs.resize(inputs.size()); + outShapes.resize(inputs.size()); - CV_Assert(dst_i < total); - CV_Assert(src_i < total); + for (size_t i = 0; i < inputs.size(); i++) + { + std::vector inputShape(inputs[i]->size.p, inputs[i]->size.p + inputs[i]->dims); + computeShapeByReshapeMask(inputShape, newShapeDesc, newShapeRange, outShapes[i]); + outputs[i] = inputs[i]->reshape(1, outShapes[i]); + } + } - dstData[dst_i] = srcData[src_i]; + void forward(std::vector &inputs, std::vector &outputs) + { + for (size_t i = 0; i < outputs.size(); i++) + { + Mat srcBlob = *inputs[i]; + int dims = srcBlob.dims; + std::vector inputShape(srcBlob.size.p, srcBlob.size.p + dims); + bool channelsReduced = dims > (int)outShapes[i].size() || + (dims == 4 && inputShape[1] > outShapes[i][1]); + bool performReordering = enableReordering && dims == 4 && channelsReduced; + + if (performReordering) + { + Mat reordered_blob(inputShape, srcBlob.type()); + + float *dstData = reordered_blob.ptr(); + const float *srcData = srcBlob.ptr(); + + int num = inputShape[0], channels = inputShape[1], height = inputShape[2], width = inputShape[3]; + int total = num*channels*height*width; + for(int i_n = 0; i_n < num; i_n++) { + for(int i_c = 0; i_c < channels; i_c++) { + for(int i_h = 0; i_h < height; i_h++) { + for(int i_w = 0; i_w < width; i_w++) { + int src_i = channels*height*width*i_n + height*width*i_c + width*i_h + i_w; + int dst_i = channels*height*width*i_n + i_c + channels*width*i_h + channels*i_w; + + CV_Assert(dst_i < total); + CV_Assert(src_i < total); + + dstData[dst_i] = srcData[src_i]; + } } } } + + srcBlob = reordered_blob; } - srcBlob = reordered_blob; + // TODO: we should not assign srcBlob if performReordering is true. 
+ outputs[i] = srcBlob.reshape(1, outShapes[i]); } - - outputs[i].shareFrom(srcBlob); - outputs[i].reshape(outShapes[i]); } -} -Ptr ReshapeLayer::create(const BlobShape &newShape, Range applyingRange /*= Range::all()*/, - bool enableReordering /*= false*/) + std::vector > outShapes; + bool enableReordering; +}; + +Ptr ReshapeLayer::create(const LayerParams& params) { - return Ptr(new ReshapeLayerImpl(newShape, applyingRange, enableReordering)); + return Ptr(new ReshapeLayerImpl(params)); } diff --git a/modules/dnn/src/layers/reshape_layer.hpp b/modules/dnn/src/layers/reshape_layer.hpp deleted file mode 100644 index 10718b838f4..00000000000 --- a/modules/dnn/src/layers/reshape_layer.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
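The mask semantics implemented by computeShapeByReshapeMask() earlier in this file are Caffe's Reshape rules: a positive entry sets the output dimension explicitly, 0 copies the matching input dimension, and a single -1 is inferred so the total element count is preserved. A hypothetical re-implementation for the simple whole-shape case; the patch's function additionally handles partial axis ranges:

#include <vector>
#include <cassert>
#include <cstddef>

static std::vector<int> reshapeByMask(const std::vector<int>& src,
                                      const std::vector<int>& mask)
{
    std::vector<int> dst(mask.size());
    size_t srcTotal = 1, dstTotal = 1;
    int inferDim = -1;
    for (size_t i = 0; i < src.size(); i++)
        srcTotal *= src[i];
    for (size_t i = 0; i < mask.size(); i++)
    {
        if (mask[i] > 0)
            dst[i] = mask[i];                      // explicit output size
        else if (mask[i] == 0)
        {
            assert(i < src.size());                // 0 copies the input dim
            dst[i] = src[i];
        }
        else
        {
            assert(mask[i] == -1 && inferDim < 0); // at most one inferred dim
            inferDim = (int)i;
            dst[i] = 1;
        }
        dstTotal *= dst[i];
    }
    if (inferDim >= 0)
    {
        assert(srcTotal % dstTotal == 0);
        dst[inferDim] = (int)(srcTotal / dstTotal);
    }
    else
        assert(srcTotal == dstTotal);
    return dst;
}

For example, reshapeByMask({2, 3, 4, 5}, {0, -1, 20}) copies the 2, infers 120 / (2 * 20) = 3, and returns {2, 3, 20}.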
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_RESHAPE_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_RESHAPE_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class ReshapeLayerImpl : public ReshapeLayer -{ - std::vector outShapes; - bool enableReordering; - -public: - ReshapeLayerImpl(const BlobShape &newShape_, Range applyingRange_, bool enableReordering_); - - void allocate(const std::vector &inputs, std::vector &outputs); - - void forward(std::vector &inputs, std::vector &outputs); -}; - -Ptr createFlattenLayer(LayerParams&); - -} -} - -#endif diff --git a/modules/dnn/src/layers/scale_layer.cpp b/modules/dnn/src/layers/scale_layer.cpp index 952856002ef..b465be7c9e3 100644 --- a/modules/dnn/src/layers/scale_layer.cpp +++ b/modules/dnn/src/layers/scale_layer.cpp @@ -9,51 +9,70 @@ Implementation of Scale layer. */ -#include "scale_layer.hpp" +#include "../precomp.hpp" +#include "layers_common.hpp" +#include namespace cv { namespace dnn { -void ScaleLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) +class ScaleLayerImpl : public ScaleLayer { - CV_Assert(blobs.size() == 1 + hasBias); - - outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) +public: + ScaleLayerImpl(const LayerParams& params) { - outputs[i].create(inputs[i]->shape()); + setParamsFrom(params); + hasBias = params.get("bias_term", false); } -} -void ScaleLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() == 1); + void allocate(const std::vector &inputs, std::vector &outputs) + { + CV_Assert(blobs.size() == 1 + hasBias); - Blob &inpBlob = *inputs[0]; + outputs.resize(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) + { + const Mat& inp = *inputs[i]; + outputs[i].create(inp.dims, inp.size.p, inp.type()); + } + } - for (size_t ii = 0; ii < outputs.size(); ii++) + void forward(std::vector &inputs, std::vector &outputs) { - Blob &outBlob = outputs[ii]; + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Mat &inpBlob = *inputs[ii]; + Mat &outBlob = outputs[ii]; - CV_Assert(inpBlob.channels() == blobs[0].total()); + CV_Assert(inpBlob.size[1] == blobs[0].total()); + if (hasBias) + CV_Assert(inpBlob.size[1] == blobs[1].total()); - if (hasBias) - CV_Assert(inpBlob.channels() == blobs[1].total()); + CV_Assert(inpBlob.type() == CV_32F && outBlob.type() == CV_32F); - for (int n = 0; n < inpBlob.channels(); n++) - { - float w = blobs[0].matRefConst().at(n); - float b = hasBias ? blobs[1].matRefConst().at(n) : 0; - outBlob.getPlane(0, n) = w*inpBlob.getPlane(0, n) + b; - } + for( int cn = 0; cn < inpBlob.size[0]; cn++ ) + { + for (int n = 0; n < inpBlob.size[1]; n++) + { + float w = blobs[0].at(n); + float b = hasBias ? blobs[1].at(n) : 0; + Mat outBlobPlane = getPlane(outBlob, cn, n); + Mat inpBlobPlane = getPlane(inpBlob, cn, n); + inpBlobPlane.convertTo(outBlobPlane, CV_32F, w, b); + } + } + } } -} -Ptr ScaleLayer::create(bool hasBias) + bool hasBias; +}; + + +Ptr ScaleLayer::create(const LayerParams& params) { - return Ptr(new ScaleLayerImpl(hasBias)); + return Ptr(new ScaleLayerImpl(params)); } } // namespace dnn diff --git a/modules/dnn/src/layers/scale_layer.hpp b/modules/dnn/src/layers/scale_layer.hpp deleted file mode 100644 index ee58af4326e..00000000000 --- a/modules/dnn/src/layers/scale_layer.hpp +++ /dev/null @@ -1,36 +0,0 @@ -// This file is part of OpenCV project. 
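The rewritten ScaleLayerImpl::forward() above applies a per-channel affine map y = w*x + b by calling Mat::convertTo on each H x W plane. A minimal sketch of the same idea on a hypothetical 4D blob (shapes and values are illustrative only):

#include <opencv2/core.hpp>
using namespace cv;

int main()
{
    // One sample with 2 channels of 4x4 data, plus per-channel weight/bias.
    int sz[] = {1, 2, 4, 4};
    Mat blob(4, sz, CV_32F, Scalar(1));
    float w[] = {2.f, 0.5f}, b[] = {1.f, 0.f};

    for (int ch = 0; ch < sz[1]; ch++)
    {
        // View one H x W plane inside the 4D blob and scale it in place.
        Mat plane(sz[2], sz[3], CV_32F, blob.ptr<float>(0, ch));
        plane.convertTo(plane, CV_32F, w[ch], b[ch]); // plane = w*plane + b
    }
    return 0;
}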
-// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2016, Intel Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. - -/* -Declaration of scale layer, which multiplies and shifts channels in input blob. -*/ - - -#ifndef __OPENCV_DNN_LAYERS_SCALE_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_SCALE_LAYER_HPP__ -#include - -namespace cv -{ -namespace dnn -{ - -class ScaleLayerImpl : public ScaleLayer -{ -public: - ScaleLayerImpl(bool hasBias_): hasBias(hasBias_) {} - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); - -private: - bool hasBias; -}; - -} -} - -#endif // __OPENCV_DNN_LAYERS_SCALE_LAYER_HPP__ diff --git a/modules/dnn/src/layers/shift_layer.cpp b/modules/dnn/src/layers/shift_layer.cpp index 6663640af20..03ba84b10e2 100644 --- a/modules/dnn/src/layers/shift_layer.cpp +++ b/modules/dnn/src/layers/shift_layer.cpp @@ -10,7 +10,6 @@ Implementation of shift layer, which adds up const values to blob. */ #include "../precomp.hpp" -#include "shift_layer.hpp" #include "op_blas.hpp" namespace cv @@ -18,139 +17,99 @@ namespace cv namespace dnn { -class ShiftLayerImpl { +class ShiftLayerImpl : public ShiftLayer +{ public: - static Ptr create(const std::vector &inputs, std::vector &outputs, - const std::vector& blobs); - - virtual ~ShiftLayerImpl() {} - - virtual void forward(std::vector &inputs, std::vector &outputs, const std::vector& blobs) = 0; - -protected: - ShiftLayerImpl() {} - virtual void allocate(const std::vector &inputs, std::vector &outputs, const std::vector& blobs) = 0; -}; - -namespace { + ShiftLayerImpl(const LayerParams ¶ms) + { + setParamsFrom(params); + CV_Assert(blobs.size() == 1); -class ShiftChannelsLayerImpl : public ShiftLayerImpl { -public: - virtual void forward(std::vector &inputs, std::vector &outputs, const std::vector& blobs) { - for (size_t ii = 0; ii < outputs.size(); ii++) +#ifdef HAVE_LAPACK { - Blob &inpBlob = *inputs[ii]; - Blob &outBlob = outputs[ii]; - - inpBlob.matRef().copyTo(outBlob.matRef()); - - for (int n = 0; n < inpBlob.num(); n++) + if (getBlasThreads() != cv::getThreadNum()) { - Mat dstMat(inpBlob.channels(), inpBlob.rows() * inpBlob.cols(), - outBlob.type(), outBlob.ptr(n)); - dnn::gemm(blobs[0].matRefConst(), biasOnesMat, 1, dstMat, 1); //TODO: gemv + setBlasThreads(cv::getThreadNum()); } } +#endif } -protected: - virtual void allocate(const std::vector &inputs, std::vector &outputs, const std::vector& blobs) { + virtual void allocate(const std::vector &inputs, std::vector &outputs) + { CV_Assert(inputs.size() > 0); - - const Blob &inpBlob = *inputs[0]; - CV_Assert(inpBlob.dims() == 4 && inpBlob.type() == CV_32F); - const Blob &biasBlob = blobs[0]; - CV_Assert(biasBlob.total() == (size_t)inpBlob.channels()); - + CV_Assert(blobs.size() > 0); + const Mat &inpBlob = *inputs[0]; + CV_Assert(inpBlob.dims == 4 && inpBlob.type() == CV_32F); + const Mat &biasBlob = blobs[0]; outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) + + if(inpBlob.dims == biasBlob.dims) { - CV_Assert(inputs[i]->type() == inpBlob.type()); - CV_Assert(inputs[i]->dims() == 4 && inputs[i]->channels() == inpBlob.channels()); + for (size_t i = 0; i < inputs.size(); i++) + { + CV_Assert(inputs[i]->type() == inpBlob.type()); + CV_Assert(inputs[i]->dims == inpBlob.dims); - outputs[i].shareFrom(*inputs[i]); + outputs[i] = 
*inputs[i]; + } } + else + { + CV_Assert(biasBlob.total() == (size_t)inpBlob.size[1]); - biasOnesMat = Mat::ones(1, inpBlob.rows() * inpBlob.cols(), inpBlob.type()); - } - -private: - Mat biasOnesMat; -}; - + for (size_t i = 0; i < inputs.size(); i++) + { + CV_Assert(inputs[i]->type() == inpBlob.type()); + CV_Assert(inputs[i]->dims == 4 && inputs[i]->size[1] == inpBlob.size[1]); -class ShiftElementsLayerImpl : public ShiftLayerImpl { -public: - virtual void forward(std::vector &inputs, std::vector &outputs, const std::vector& blobs) { - for (size_t ii = 0; ii < outputs.size(); ii++) - { - Blob &inpBlob = *inputs[ii]; - Blob &outBlob = outputs[ii]; + outputs[i] = *inputs[i]; + } - outBlob.matRef() = inpBlob.matRef() + blobs[0].matRefConst(); + biasOnesMat = Mat::ones(1, inpBlob.size[2] * inpBlob.size[3], inpBlob.type()); } } -protected: - virtual void allocate(const std::vector &inputs, std::vector &outputs, const std::vector& blobs) { + virtual void forward(std::vector &inputs, std::vector &outputs) + { CV_Assert(inputs.size() > 0); + CV_Assert(blobs.size() > 0); - const Blob &inpBlob = *inputs[0]; - CV_Assert(inpBlob.type() == CV_32F); - const Blob &biasBlob = blobs[0]; - CV_Assert(biasBlob.dims() == inpBlob.dims()); - - outputs.resize(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) + if(inputs[0]->dims == blobs[0].dims) { - CV_Assert(inputs[i]->type() == inpBlob.type()); - CV_Assert(inputs[i]->dims() == inpBlob.dims()); + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Mat &inpBlob = *inputs[ii]; + Mat &outBlob = outputs[ii]; - outputs[i].shareFrom(*inputs[i]); + outBlob = inpBlob + blobs[0]; + } } - } -}; - -} - -Ptr ShiftLayerImpl::create(const std::vector &inputs, std::vector &outputs, - const std::vector& blobs) { - Ptr impl; - - CV_Assert(inputs.size() > 0); - CV_Assert(blobs.size() > 0); - - if(inputs[0]->dims() == blobs[0].dims()) - impl = Ptr(new ShiftElementsLayerImpl); - else - impl = Ptr(new ShiftChannelsLayerImpl); - - impl->allocate(inputs, outputs, blobs); - return impl; -} + else + { + for (size_t ii = 0; ii < outputs.size(); ii++) + { + Mat &inpBlob = *inputs[ii]; + Mat &outBlob = outputs[ii]; -ShiftLayer::ShiftLayer(LayerParams ¶ms) : Layer(params) -{ - CV_Assert(blobs.size() == 1); + inpBlob.copyTo(outBlob); - #ifdef HAVE_LAPACK - { - if (getBlasThreads() != cv::getThreadNum()) - { - setBlasThreads(cv::getThreadNum()); + for (int n = 0; n < inpBlob.size[0]; n++) + { + Mat dstMat(inpBlob.size[1], inpBlob.size[2] * inpBlob.size[3], + outBlob.type(), outBlob.ptr(n)); + dnn::gemm(blobs[0], biasOnesMat, 1, dstMat, 1); //TODO: gemv + } + } } } - #endif -} -void ShiftLayer::allocate(const std::vector &inputs, std::vector &outputs) -{ - impl = ShiftLayerImpl::create(inputs, outputs, blobs); -} + Mat biasOnesMat; +}; -void ShiftLayer::forward(std::vector &inputs, std::vector &outputs) +Ptr ShiftLayer::create(const LayerParams& params) { - impl->forward(inputs, outputs, blobs); + return Ptr(new ShiftLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/shift_layer.hpp b/modules/dnn/src/layers/shift_layer.hpp deleted file mode 100644 index 36808ffbf64..00000000000 --- a/modules/dnn/src/layers/shift_layer.hpp +++ /dev/null @@ -1,38 +0,0 @@ -// This file is part of OpenCV project. -// It is subject to the license terms in the LICENSE file found in the top-level directory -// of this distribution and at http://opencv.org/license.html. - -// Copyright (C) 2016, Intel Corporation, all rights reserved. 
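
Reviewer note: folding ShiftChannelsLayerImpl into the class above keeps the same math — a C x 1 bias times a 1 x (H*W) row of ones gives a C x (H*W) matrix that is accumulated onto every sample (the gemm call is still tagged `//TODO: gemv`). A standalone sketch using plain `cv::Mat` expressions instead of the module's `dnn::gemm` wrapper:

```cpp
// Sketch: broadcast a per-channel bias over an NCHW float blob in place.
#include <opencv2/core.hpp>

static void shiftChannels(cv::Mat &blob, const cv::Mat &bias)
{
    CV_Assert(blob.dims == 4 && blob.type() == CV_32F && bias.type() == CV_32F);
    CV_Assert(bias.total() == (size_t)blob.size[1]);
    int planeSize = blob.size[2] * blob.size[3];
    cv::Mat biasCol = bias.reshape(1, blob.size[1]);    // C x 1
    cv::Mat ones = cv::Mat::ones(1, planeSize, CV_32F); // 1 x H*W
    cv::Mat broadcast = biasCol * ones;                 // C x H*W, rank-1 product
    for (int n = 0; n < blob.size[0]; n++)
    {
        // 2-D header over sample n's channels; writes land in the blob
        cv::Mat sample(blob.size[1], planeSize, CV_32F, blob.ptr<float>(n));
        sample += broadcast;
    }
}
```
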
-// Third party copyrights are property of their respective owners. - -/* -Declaration of shift layer, which adds up const values to blob. -*/ - -#ifndef __OPENCV_DNN_LAYERS_SHIFT_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_SHIFT_LAYER_HPP__ -#include "../precomp.hpp" - -namespace cv -{ -namespace dnn -{ - -class ShiftLayerImpl; - -class ShiftLayer : public Layer -{ -public: - ShiftLayer() {} - ShiftLayer(LayerParams ¶ms); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); - -private: - cv::Ptr impl; - -}; - -} -} -#endif diff --git a/modules/dnn/src/layers/slice_layer.cpp b/modules/dnn/src/layers/slice_layer.cpp index 01dc27fe447..d2d1643d40e 100644 --- a/modules/dnn/src/layers/slice_layer.cpp +++ b/modules/dnn/src/layers/slice_layer.cpp @@ -41,8 +41,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "slice_layer.hpp" -#include #include namespace cv @@ -50,97 +48,89 @@ namespace cv namespace dnn { -SliceLayerImpl::SliceLayerImpl(int axis_ /*= 1*/) +class SliceLayerImpl : public SliceLayer { - axis = axis_; -} - -SliceLayerImpl::SliceLayerImpl(int axis_, const std::vector &sliceIndices_) -{ - axis = axis_; - sliceIndices = sliceIndices_; -} - -void SliceLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() == 1); - - const Blob &inpBlob = *inputs[0]; - useOpenCL = ocl::useOpenCL() && inpBlob.getState() == Blob::HEAD_AT_UMAT; +public: + SliceLayerImpl(const LayerParams& params) + { + setParamsFrom(params); + axis = params.get("axis", 1); - axisIdx = inpBlob.canonicalAxis(axis); - int axisSize = inpBlob.size(axisIdx); - BlobShape inpShape = inpBlob.shape(); - int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT; + if (params.has("slice_point")) + { + const DictValue &indicesValue = params.get("slice_point"); + int i, n = indicesValue.size(); + sliceIndices.resize(n); + for (i = 0; i < n; i++) + sliceIndices[i] = indicesValue.get(i); + } + } - if (sliceIndices.size()) //divide blob with respect to passed parameters + void allocate(const std::vector &inputs, std::vector &outputs) { - std::vector outAxisSize; - int prevSlice = 0; + CV_Assert(inputs.size() == 1); + const Mat &inpBlob = *inputs[0]; + int dims = inpBlob.dims; - for (size_t i = 0; i < sliceIndices.size(); i++) - { - if (!(prevSlice < sliceIndices[i] && sliceIndices[i] < axisSize)) - CV_Error(Error::StsBadArg, "Slice indices should be positive, increased and don't exceed size of sliced dimension"); + axisIdx = axis < 0 ? 
axis + dims : axis; + int axisSize = inpBlob.size[axisIdx]; + std::vector inpShape(inpBlob.size.p, inpBlob.size.p + dims); - outAxisSize.push_back(sliceIndices[i] - prevSlice); - prevSlice = sliceIndices[i]; + if (sliceIndices.size()) //divide blob with respect to passed parameters + { + std::vector outAxisSize; + int prevSlice = 0; + + for (size_t i = 0; i < sliceIndices.size(); i++) + { + if (!(prevSlice < sliceIndices[i] && sliceIndices[i] < axisSize)) + CV_Error(Error::StsBadArg, "Slice indices should be positive, increased and don't exceed size of sliced dimension"); + + outAxisSize.push_back(sliceIndices[i] - prevSlice); + prevSlice = sliceIndices[i]; + } + outAxisSize.push_back(axisSize - prevSlice); + + outputs.resize(outAxisSize.size()); + for (size_t i = 0; i < outAxisSize.size(); i++) + { + inpShape[axisIdx] = outAxisSize[i]; + outputs[i].create(inpShape, inpBlob.type()); + } } - outAxisSize.push_back(axisSize - prevSlice); - - outputs.resize(outAxisSize.size()); - for (size_t i = 0; i < outAxisSize.size(); i++) + else //divide blob with respect to count of output blobs { - inpShape[axisIdx] = outAxisSize[i]; - outputs[i].create(inpShape, inpBlob.type(), allocFlags); + CV_Assert(outputs.size() > 0 && axisSize % outputs.size() == 0); + int outAxisSize = axisSize / (int)outputs.size(); + + for (size_t i = 0; i < outputs.size(); i++) + { + inpShape[axisIdx] = outAxisSize; + outputs[i].create(inpShape, inpBlob.type()); + } } } - else //divide blob with respect to count of output blobs + + void forward(std::vector &inputs, std::vector &outputs) { - CV_Assert(outputs.size() > 0 && axisSize % outputs.size() == 0); - int outAxisSize = axisSize / (int)outputs.size(); + const Mat& inpMat = *inputs[0]; + std::vector ranges(inpMat.dims, Range::all()); + ranges[axisIdx].start = 0; for (size_t i = 0; i < outputs.size(); i++) { - inpShape[axisIdx] = outAxisSize; - outputs[i].create(inpShape, inpBlob.type(), allocFlags); + ranges[axisIdx].end = ranges[axisIdx].start + outputs[i].size[axisIdx]; + inpMat(&ranges[0]).copyTo(outputs[i]); + ranges[axisIdx].start = ranges[axisIdx].end; } } -} - -void SliceLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - #ifdef HAVE_OPENCL - if (useOpenCL) - forward_(inputs, outputs); - else - #endif - forward_(inputs, outputs); -} - -template -void SliceLayerImpl::forward_(std::vector &inputs, std::vector &outputs) -{ - const XMat& inpMat = inputs[0]->getRefConst(); - std::vector ranges(inputs[0]->dims(), Range::all()); - ranges[axisIdx].start = 0; - for (size_t i = 0; i < outputs.size(); i++) - { - ranges[axisIdx].end = ranges[axisIdx].start + outputs[i].size(axisIdx); - inpMat(&ranges[0]).copyTo(outputs[i].getRef()); - ranges[axisIdx].start = ranges[axisIdx].end; - } -} - -Ptr SliceLayer::create(int axis) -{ - return Ptr(new SliceLayerImpl(axis)); -} + int axisIdx; +}; -Ptr SliceLayer::create(int axis, const std::vector &sliceIndices) +Ptr SliceLayer::create(const LayerParams& params) { - return Ptr(new SliceLayerImpl(axis, sliceIndices)); + return Ptr(new SliceLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/slice_layer.hpp b/modules/dnn/src/layers/slice_layer.hpp deleted file mode 100644 index 4f7cbb37c2d..00000000000 --- a/modules/dnn/src/layers/slice_layer.hpp +++ /dev/null @@ -1,71 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 
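
Reviewer note: the rewritten SliceLayerImpl::forward replaces the old templated Mat/UMat paths with a single moving window of Ranges. The core of it, extracted as a free function (assuming the output Mats are already shaped, as allocate() guarantees):

```cpp
// Sketch: split a blob into consecutive chunks along one axis.
#include <opencv2/core.hpp>
#include <vector>

static void sliceAlongAxis(const cv::Mat &inp, int axis,
                           std::vector<cv::Mat> &outs)
{
    std::vector<cv::Range> ranges(inp.dims, cv::Range::all());
    ranges[axis].start = 0;
    for (size_t i = 0; i < outs.size(); i++)
    {
        ranges[axis].end = ranges[axis].start + outs[i].size[axis];
        inp(&ranges[0]).copyTo(outs[i]); // sub-blob view -> deep copy
        ranges[axis].start = ranges[axis].end;
    }
}
```
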
-// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_SLICE_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_SLICE_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class SliceLayerImpl : public SliceLayer -{ - bool useOpenCL; - int axisIdx; - - template - void forward_(std::vector &inputs, std::vector &outputs); - -public: - SliceLayerImpl(int axis_ = 1); - SliceLayerImpl(int axis_, const std::vector &sliceIndices_); - - void allocate(const std::vector &inputs, std::vector &outputs); - - void forward(std::vector &inputs, std::vector &outputs); -}; - -} -} -#endif diff --git a/modules/dnn/src/layers/softmax_layer.cpp b/modules/dnn/src/layers/softmax_layer.cpp index 54751bc7cd6..5f7b4e4c91f 100644 --- a/modules/dnn/src/layers/softmax_layer.cpp +++ b/modules/dnn/src/layers/softmax_layer.cpp @@ -41,9 +41,6 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "softmax_layer.hpp" -#include -#include "opencl_kernels_dnn.hpp" #include #include using std::max; @@ -53,171 +50,112 @@ namespace cv namespace dnn { -SoftMaxLayerImpl::SoftMaxLayerImpl(int axis) +class SoftMaxLayerImpl : public SoftmaxLayer { - axisRaw = axis; -} - -void SoftMaxLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() == 1); - axis = inputs[0]->canonicalAxis(axisRaw); - - useOpenCL = ocl::useOpenCL(); - - BlobShape shape = inputs[0]->shape(); - outerSize = shape.total(0, axis); - channels = shape[axis]; - innerSize = shape.total(axis + 1); - - int allocFlag = (useOpenCL) ? 
Blob::ALLOC_UMAT : Blob::ALLOC_MAT; - shape[axis] = 1; - buf.create(shape, inputs[0]->type(), allocFlag); +public: - outputs.resize(1); - outputs[0].create(inputs[0]->shape(), inputs[0]->type(), allocFlag); -} - -void SoftMaxLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - Blob &src = *inputs[0]; - Blob &dst = outputs[0]; - - if (!useOpenCL) - forward_cpu(src, dst); - else + SoftMaxLayerImpl(const LayerParams& params) { - CV_Assert(forward_ocl(src, dst)); + axisRaw = params.get("axis", 1); + setParamsFrom(params); } -} -#ifdef HAVE_OPENCL -bool SoftMaxLayerImpl::forward_ocl(Blob &src, Blob &dst) -{ - const UMat &srcMat = src.umatRefConst(); - UMat &dstMat = dst.umatRef(); - srcMat.copyTo(dstMat); - UMat &bufMat = buf.umatRef(); - CV_Assert(dstMat.offset == 0); - - String buildOpts = String("-DT=") + ocl::typeToStr(src.type()); - ocl::Kernel kmax, ksub, ksum, kdiv; - - if (!kmax.create("kernel_channel_max", ocl::dnn::softmax_oclsrc, buildOpts)) - return false; - - if (!ksub.create("kernel_channel_subtract", ocl::dnn::softmax_oclsrc, buildOpts)) - return false; - - if (!ksum.create("kernel_channel_sum", ocl::dnn::softmax_oclsrc, buildOpts)) - return false; - - if (!kdiv.create("kernel_channel_div", ocl::dnn::softmax_oclsrc, buildOpts)) - return false; - - size_t wgSize = ocl::Device::getDefault().maxWorkGroupSize(); - size_t bufSize = buf.total(); - size_t totalSize = src.total(); - - kmax.args((int)outerSize, (int)channels, (int)innerSize, - ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); - if (!kmax.run(1, &bufSize, &wgSize, true)) - return false; - - ksub.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, - ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); - if (!ksub.run(1, &totalSize, &wgSize, true)) - return false; + void allocate(const std::vector &inputs, std::vector &outputs) + { + CV_Assert(inputs.size() == 1); + const Mat& inp0 = *inputs[0]; + int dims = inp0.dims; + axis = axisRaw < 0 ? 
axisRaw + dims : axisRaw; - cv::exp(dstMat, dstMat); + outerSize = inp0.total(0, axis); + channels = inp0.size[axis]; + innerSize = inp0.total(axis + 1); - ksum.args((int)outerSize, (int)channels, (int)innerSize, - ocl::KernelArg::PtrReadOnly(dstMat), ocl::KernelArg::PtrReadWrite(bufMat)); - if (!ksum.run(1, &bufSize, &wgSize, true)) - return false; + std::vector shape(inp0.size.p, inp0.size.p + dims); + shape[axis] = 1; + buf.create(shape, inp0.type()); - kdiv.args((int)totalSize, (int)outerSize, (int)channels, (int)innerSize, - ocl::KernelArg::PtrReadOnly(bufMat), ocl::KernelArg::PtrReadWrite(dstMat)); - if (!kdiv.run(1, &totalSize, &wgSize, true)) - return false; + outputs.resize(1); + outputs[0].create(inp0.dims, inp0.size.p, inp0.type()); + } - return true; -} -#else -bool SoftMaxLayerImpl::forward_ocl(Blob&, Blob&) -{ - return false; -} -#endif + void forward(std::vector &inputs, std::vector &outputs) + { + const Mat &src = *inputs[0]; + Mat &dst = outputs[0]; -void SoftMaxLayerImpl::forward_cpu(Blob &src, Blob &dst) -{ - CV_Assert(src.type() == CV_32F); + CV_Assert(src.type() == CV_32F); + CV_Assert(src.isContinuous() && dst.isContinuous()); - float *srcPtr = src.ptrf(); - float *dstPtr = dst.ptrf(); - float *bufPtr = buf.ptrf(); + const float *srcPtr = src.ptr(); + float *dstPtr = dst.ptr(); + float *bufPtr = buf.ptr(); - size_t outerStep = src.total(axis); - size_t cnStep = src.total(axis + 1); + size_t outerStep = src.total(axis); + size_t cnStep = src.total(axis + 1); - //compute max along axis - for (size_t outerDim = 0; outerDim < outerSize; outerDim++) - { - size_t srcOffset = outerDim * outerStep; - size_t bufOffset = outerDim * cnStep; + //compute max along axis + for (size_t outerDim = 0; outerDim < outerSize; outerDim++) + { + size_t srcOffset = outerDim * outerStep; + size_t bufOffset = outerDim * cnStep; - memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float)); + memcpy(bufPtr + bufOffset, srcPtr + srcOffset, innerSize * sizeof(float)); - for (size_t cnDim = 1; cnDim < channels; cnDim++) - { - for (size_t i = 0; i < innerSize; i++) - bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]); + for (size_t cnDim = 1; cnDim < channels; cnDim++) + { + for (size_t i = 0; i < innerSize; i++) + bufPtr[bufOffset + i] = std::max(bufPtr[bufOffset + i], srcPtr[srcOffset + cnDim * cnStep + i]); + } } - } - - //subtract max - for (size_t outerDim = 0; outerDim < outerSize; outerDim++) - { - size_t srcOffset = outerDim * outerStep; - size_t bufOffset = outerDim * cnStep; - for (size_t cnDim = 0; cnDim < channels; cnDim++) + //subtract max + for (size_t outerDim = 0; outerDim < outerSize; outerDim++) { - for (size_t i = 0; i < innerSize; i++) - dstPtr[srcOffset + cnDim * cnStep + i] = srcPtr[srcOffset + cnDim * cnStep + i] - bufPtr[bufOffset + i]; + size_t srcOffset = outerDim * outerStep; + size_t bufOffset = outerDim * cnStep; + + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + for (size_t i = 0; i < innerSize; i++) + dstPtr[srcOffset + cnDim * cnStep + i] = srcPtr[srcOffset + cnDim * cnStep + i] - bufPtr[bufOffset + i]; + } } - } - - cv::exp(dst.matRef(), dst.matRef()); - - for (size_t outerDim = 0; outerDim < outerSize; outerDim++) - { - size_t srcOffset = outerDim * outerStep; - size_t bufOffset = outerDim * cnStep; - //sum exp along axis - for (size_t i = 0; i < innerSize; i++) - bufPtr[bufOffset + i] = 0.f; + cv::exp(dst, dst); - for (size_t cnDim = 0; cnDim < channels; cnDim++) + for (size_t outerDim = 0; 
outerDim < outerSize; outerDim++) { - for (size_t i = 0; i < innerSize; i++) - bufPtr[bufOffset + i] += dstPtr[srcOffset + cnDim * cnStep + i]; - } + size_t srcOffset = outerDim * outerStep; + size_t bufOffset = outerDim * cnStep; - //divide by computed sum - for (size_t cnDim = 0; cnDim < channels; cnDim++) - { + //sum exp along axis for (size_t i = 0; i < innerSize; i++) - dstPtr[srcOffset + cnDim * cnStep + i] /= bufPtr[bufOffset + i]; + bufPtr[bufOffset + i] = 0.f; + + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + for (size_t i = 0; i < innerSize; i++) + bufPtr[bufOffset + i] += dstPtr[srcOffset + cnDim * cnStep + i]; + } + + //divide by computed sum + for (size_t cnDim = 0; cnDim < channels; cnDim++) + { + for (size_t i = 0; i < innerSize; i++) + dstPtr[srcOffset + cnDim * cnStep + i] /= bufPtr[bufOffset + i]; + } } } -} -Ptr SoftmaxLayer::create(int axis) + int axis, axisRaw; + Mat buf; + size_t outerSize, channels, innerSize; +}; + +Ptr SoftmaxLayer::create(const LayerParams& params) { - return Ptr(new SoftMaxLayerImpl(axis)); + return Ptr(new SoftMaxLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/softmax_layer.hpp b/modules/dnn/src/layers/softmax_layer.hpp deleted file mode 100644 index fad97dddc8c..00000000000 --- a/modules/dnn/src/layers/softmax_layer.hpp +++ /dev/null @@ -1,72 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
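
Reviewer note: with the OpenCL path removed, SoftMaxLayerImpl::forward is the classic numerically stable softmax — subtract the per-position maximum, exponentiate, normalize. Collapsed to a single contiguous vector for clarity:

```cpp
// Sketch: stable softmax over one vector.
#include <algorithm>
#include <cmath>

static void softmax1D(const float *src, float *dst, int n)
{
    float mx = *std::max_element(src, src + n); // guards exp() against overflow
    float sum = 0.f;
    for (int i = 0; i < n; i++)
    {
        dst[i] = std::exp(src[i] - mx);
        sum += dst[i];
    }
    for (int i = 0; i < n; i++)
        dst[i] /= sum;
}
```
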
-// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_SOFTMAX_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_SOFTMAX_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class SoftMaxLayerImpl : public SoftmaxLayer -{ - int axis, axisRaw; - Blob buf; - bool useOpenCL; - size_t outerSize, channels, innerSize; - - - bool forward_ocl(Blob &src, Blob &dst); - void forward_cpu(Blob &src, Blob &dst); - -public: - - SoftMaxLayerImpl(int axis = 1); - void allocate(const std::vector &inputs, std::vector &outputs); - void forward(std::vector &inputs, std::vector &outputs); -}; - -} -} -#endif diff --git a/modules/dnn/src/layers/split_layer.cpp b/modules/dnn/src/layers/split_layer.cpp index cd3a507443b..ac6b39c8e1e 100644 --- a/modules/dnn/src/layers/split_layer.cpp +++ b/modules/dnn/src/layers/split_layer.cpp @@ -41,47 +41,54 @@ #include "../precomp.hpp" #include "layers_common.hpp" -#include "split_layer.hpp" -#include namespace cv { namespace dnn { -SplitLayerImpl::SplitLayerImpl(int outputsCount_ /*= -1*/) +class SplitLayerImpl : public SplitLayer { - outputsCount = outputsCount_; -} +public: + SplitLayerImpl(const LayerParams ¶ms) + { + setParamsFrom(params); + //TODO: maybe "top_count" param is useless because it can be determined by output connections number + if (params.has("top_count")) + { + outputsCount = params.get("top_count"); + CV_Assert(outputsCount >= 0); + } + else + { + outputsCount = -1; + } + } -void SplitLayerImpl::allocate(const std::vector &inputs, std::vector &outputs) -{ - CV_Assert(inputs.size() == 1); - useOpenCL = ocl::useOpenCL() && inputs[0]->getState() == Blob::HEAD_AT_UMAT; - int allocFlags = useOpenCL ? Blob::ALLOC_UMAT : Blob::ALLOC_MAT; + void allocate(const std::vector &inputs, std::vector &outputs) + { + CV_Assert(inputs.size() == 1); + const Mat& inp0 = *inputs[0]; - if (outputsCount >= 0) - outputs.resize(outputsCount); + if (outputsCount >= 0) + outputs.resize(outputsCount); - for (size_t i = 0; i < outputs.size(); i++) - outputs[i].create(inputs[0]->shape(), inputs[0]->type(), allocFlags); -} + for (size_t i = 0; i < outputs.size(); i++) + outputs[i].create(inp0.dims, inp0.size.p, inp0.type()); + } -void SplitLayerImpl::forward(std::vector &inputs, std::vector &outputs) -{ - for (size_t i = 0; i < outputs.size(); i++) + void forward(std::vector &inputs, std::vector &outputs) { - if (useOpenCL) - inputs[0]->umatRefConst().copyTo(outputs[i].umatRef()); - else - inputs[0]->matRefConst().copyTo(outputs[i].matRef()); + for (size_t i = 0; i < outputs.size(); i++) + { + inputs[0]->copyTo(outputs[i]); + } } -} - +}; -Ptr SplitLayer::create(int outputsCount) +Ptr SplitLayer::create(const LayerParams& params) { - return Ptr(new SplitLayerImpl(outputsCount)); + return Ptr(new SplitLayerImpl(params)); } } diff --git a/modules/dnn/src/layers/split_layer.hpp b/modules/dnn/src/layers/split_layer.hpp deleted file mode 100644 index 124cb1275b5..00000000000 --- a/modules/dnn/src/layers/split_layer.hpp +++ /dev/null @@ -1,66 +0,0 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. 
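
Reviewer note: every `create(const LayerParams&)` factory in this patch follows the same plumbing — scalars via `params.get` with a default, arrays via DictValue indexing (SliceLayerImpl's "slice_point", SplitLayerImpl's "top_count"). A sketch of the array case; the key name here is just an example:

```cpp
// Sketch: pulling an optional integer list out of LayerParams.
#include <opencv2/dnn.hpp>
#include <vector>

static std::vector<int> readIntList(const cv::dnn::LayerParams &params,
                                    const cv::String &key)
{
    std::vector<int> values;
    if (params.has(key))
    {
        const cv::dnn::DictValue &dv = params.get(key);
        for (int i = 0; i < dv.size(); i++)
            values.push_back(dv.get<int>(i));
    }
    return values;
}
```
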
-// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ - -#ifndef __OPENCV_DNN_LAYERS_SPLIT_LAYER_HPP__ -#define __OPENCV_DNN_LAYERS_SPLIT_LAYER_HPP__ -#include "../precomp.hpp" -#include - -namespace cv -{ -namespace dnn -{ - -class SplitLayerImpl : public SplitLayer -{ - bool useOpenCL; - -public: - SplitLayerImpl(int outputsCount_ = -1); - - void allocate(const std::vector &inputs, std::vector &outputs); - - void forward(std::vector &inputs, std::vector &outputs); -}; - -} -} -#endif diff --git a/modules/dnn/src/precomp.hpp b/modules/dnn/src/precomp.hpp index 6932bc8e5bc..b622e38da2e 100644 --- a/modules/dnn/src/precomp.hpp +++ b/modules/dnn/src/precomp.hpp @@ -42,3 +42,4 @@ #include #include "cvconfig.h" #include +#include diff --git a/modules/dnn/src/tensorflow/tf_importer.cpp b/modules/dnn/src/tensorflow/tf_importer.cpp index b6aa3682216..84460e60477 100644 --- a/modules/dnn/src/tensorflow/tf_importer.cpp +++ b/modules/dnn/src/tensorflow/tf_importer.cpp @@ -51,31 +51,32 @@ struct Pin int blobIndex; }; -BlobShape blobShapeFromTensor(const tensorflow::TensorProto &tensor) +void blobShapeFromTensor(const tensorflow::TensorProto &tensor, std::vector& shape) { + shape.clear(); if (tensor.has_tensor_shape()) { const tensorflow::TensorShapeProto &_shape = tensor.tensor_shape(); - BlobShape shape = BlobShape::all(_shape.dim_size()); + int i, n = _shape.dim_size(); + shape.resize(n); - for (int i = 0; i < _shape.dim_size(); i++) + for (i = 0; i < n; i++) shape[i] = (int)_shape.dim(i).size(); - - return shape; } else { CV_Error(Error::StsError, "Unknown shape of input tensor"); - return BlobShape(); } } template -void parseTensor(const tensorflow::TensorProto &tensor, Blob &dstBlob) +void parseTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob) { - BlobShape shape = blobShapeFromTensor(tensor); + std::vector shape; + blobShapeFromTensor(tensor, shape); + int dims = (int)shape.size(); - if (shape.dims() == 4) + if (dims == 4) { // REORDER blob NHWC to NCHW swap(shape[2], shape[3]); // NHCW @@ -85,12 +86,12 @@ 
void parseTensor(const tensorflow::TensorProto &tensor, Blob &dstBlob) dstBlob.create(shape, CV_32F); int size = tensor.tensor_content().size() / sizeof(T); - CV_Assert(size == (int)dstBlob.matRefConst().total()); + CV_Assert(size == (int)dstBlob.total()); - float *dstData = dstBlob.matRef().ptr(); + float *dstData = dstBlob.ptr(); const T *data = reinterpret_cast(tensor.tensor_content().c_str()); - if (shape.dims() == 4) + if (dims == 4) { int num = shape[0], channels = shape[1], height = shape[2], width = shape[3]; int total = num*channels*height*width; @@ -115,7 +116,7 @@ void parseTensor(const tensorflow::TensorProto &tensor, Blob &dstBlob) } } -void blobFromTensor(const tensorflow::TensorProto &tensor, Blob &dstBlob) +void blobFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob) { switch (tensor.dtype()) { case tensorflow::DT_FLOAT: @@ -235,10 +236,12 @@ void setStrides(LayerParams &layerParams, const tensorflow::NodeDef &layer) } DictValue parseDims(const tensorflow::TensorProto &tensor) { - BlobShape shape = blobShapeFromTensor(tensor); + std::vector shape; + blobShapeFromTensor(tensor, shape); + int dims = (int)shape.size(); CV_Assert(tensor.dtype() == tensorflow::DT_INT32); - CV_Assert(shape.dims() == 1); + CV_Assert(dims == 1); int size = tensor.tensor_content().size() / sizeof(int); const int *data = reinterpret_cast(tensor.tensor_content().c_str()); @@ -372,7 +375,7 @@ class TFImporter : public Importer { ~TFImporter() {} private: - void kernelFromTensor(const tensorflow::TensorProto &tensor, Blob &dstBlob); + void kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob); void connect(const std::map& layers_name_id_map, Net& network, const Pin& outPin, const int input_layer_id, const int input_blob_id); @@ -391,13 +394,15 @@ TFImporter::TFImporter(const char *model) ReadTFNetParamsFromBinaryFileOrDie(model, &net); } -void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Blob &dstBlob) +void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Mat &dstBlob) { - BlobShape shape = blobShapeFromTensor(tensor); + std::vector shape; + blobShapeFromTensor(tensor, shape); + int dims = (int)shape.size(); // TODO: other blob types CV_Assert(tensor.dtype() == tensorflow::DT_FLOAT); - CV_Assert(shape.dims() == 4); + CV_Assert(dims == 4); // REORDER kernel HWIO to OIHW swap(shape[0], shape[2]); // IWHO @@ -407,9 +412,9 @@ void TFImporter::kernelFromTensor(const tensorflow::TensorProto &tensor, Blob &d dstBlob.create(shape, CV_32F); int size = tensor.tensor_content().size() / sizeof(float); - CV_Assert(size == (int)dstBlob.matRefConst().total()); + CV_Assert(size == (int)dstBlob.total()); - float *dstData = dstBlob.matRef().ptr(); + float *dstData = dstBlob.ptr(); const float *data = reinterpret_cast(tensor.tensor_content().c_str()); int out_c = shape[0], input_c = shape[1], height = shape[2], width = shape[3]; @@ -533,7 +538,7 @@ void TFImporter::populateNet(Net dstNet) } kernelFromTensor(getConstBlob(layer, value_id), layerParams.blobs[0]); - BlobShape kshape = layerParams.blobs[0].shape(); + const int* kshape = layerParams.blobs[0].size.p; layerParams.set("kernel_h", kshape[2]); layerParams.set("kernel_w", kshape[3]); layerParams.set("num_output", kshape[0]); @@ -588,13 +593,11 @@ void TFImporter::populateNet(Net dstNet) blobFromTensor(getConstBlob(layer, value_id, -1, &kernel_blob_index), layerParams.blobs[0]); if (kernel_blob_index == 1) { // In this case output is computed by x*W formula - W should be transposed - Mat data = 
layerParams.blobs[0].matRef().t(); - BlobShape shape(data.rows, data.cols); - layerParams.blobs[0].fill(shape, layerParams.blobs[0].type(), data.data); + Mat data = layerParams.blobs[0].t(); + layerParams.blobs[0] = data.clone(); } - BlobShape kshape = layerParams.blobs[0].shape(); - layerParams.set("num_output", kshape[0]); + layerParams.set("num_output", layerParams.blobs[0].size[0]); int id = dstNet.addLayer(name, "InnerProduct", layerParams); layer_id[name] = id; diff --git a/modules/dnn/src/torch/torch_importer.cpp b/modules/dnn/src/torch/torch_importer.cpp index 738a4400e69..5bd22355acb 100644 --- a/modules/dnn/src/torch/torch_importer.cpp +++ b/modules/dnn/src/torch/torch_importer.cpp @@ -52,11 +52,11 @@ namespace dnn { #if defined(ENABLE_TORCH_IMPORTER) && ENABLE_TORCH_IMPORTER #include "THDiskFile.h" -#ifdef NDEBUG +//#ifdef NDEBUG static bool dbgPrint = false; -#else -static bool dbgPrint = true; -#endif +//#else +//static bool dbgPrint = true; +//#endif enum LuaType { @@ -91,13 +91,13 @@ static inline bool endsWith(const String &str, const char *substr) struct TorchImporter : public ::cv::dnn::Importer { - typedef std::map > TensorsMap; + typedef std::map > TensorsMap; Net net; THFile *file; std::set readedIndexes; std::map storages; - std::map tensors; + std::map tensors; struct Module { @@ -343,9 +343,9 @@ struct TorchImporter : public ::cv::dnn::Importer std::cout << scalarParams; std::cout << "#" << tensorParams.size() << " tensorParams:\n"; - std::map >::const_iterator it; + std::map >::const_iterator it; for (it = tensorParams.begin(); it != tensorParams.end(); it++) - std::cout << it->first << ": Tensor " << it->second.second.shape() << "\n"; + std::cout << it->first << ": Tensor " << it->second.second.size << "\n"; } } @@ -364,7 +364,7 @@ struct TorchImporter : public ::cv::dnn::Importer if (typeidx == TYPE_NIL) { - tensors.insert(std::make_pair(indexTensor, Blob())); + tensors.insert(std::make_pair(indexTensor, Mat())); return; } @@ -398,9 +398,8 @@ struct TorchImporter : public ::cv::dnn::Importer Mat srcMat(ndims, (int*)isizes, typeTensor , storages[indexStorage].ptr() + offset*CV_ELEM_SIZE(typeTensor), (size_t*)ssteps); int dstType = CV_32F; - Blob blob; - blob.create(BlobShape(ndims, isizes), dstType); - srcMat.convertTo(blob.matRef(), dstType); + Mat blob; + srcMat.convertTo(blob, dstType); tensors.insert(std::make_pair(indexTensor, blob)); } @@ -523,7 +522,7 @@ struct TorchImporter : public ::cv::dnn::Importer readTorchTable(scalarParams, tensorParams); CV_Assert(tensorParams.count("weight")); - Blob weightBlob = tensorParams["weight"].second; + Mat weightBlob = tensorParams["weight"].second; layerParams.blobs.push_back(weightBlob); bool bias = tensorParams.count("bias") != 0; @@ -531,7 +530,7 @@ struct TorchImporter : public ::cv::dnn::Importer layerParams.blobs.push_back(tensorParams["bias"].second); layerParams.set("bias_term", bias); - layerParams.set("num_output", weightBlob.size(0)); + layerParams.set("num_output", weightBlob.size[0]); curModule->modules.push_back(newModule); } else if (nnName == "Reshape") @@ -608,7 +607,7 @@ struct TorchImporter : public ::cv::dnn::Importer } else { CV_Assert(tensorParams["weight"].second.total() == 1); - float negative_slope = *tensorParams["weight"].second.ptrf(); + float negative_slope = *tensorParams["weight"].second.ptr(); layerParams.set("negative_slope", negative_slope); newModule->apiType = "ReLU"; @@ -722,10 +721,10 @@ struct TorchImporter : public ::cv::dnn::Importer layerParams.set("adj_h", 
static_cast(scalarParams.get("adjH"))); layerParams.set("num_output", static_cast(scalarParams.get("nOutputPlane"))); - Blob weights = tensorParams["weight"].second; - BlobShape shape = weights.shape(), - reorderedShape = BlobShape(shape[1], shape[0], shape[2], shape[3]); - layerParams.blobs.push_back(weights.reshape(reorderedShape)); + Mat weights = tensorParams["weight"].second; + CV_Assert(weights.dims == 4); + int reorderedShape[] = { weights.size[1], weights.size[0], weights.size[2], weights.size[3] }; + layerParams.blobs.push_back(weights.reshape(1, 4, reorderedShape)); bool bias = tensorParams.count("bias"); layerParams.set("bias_term", bias); @@ -963,7 +962,7 @@ Ptr createTorchImporter(const String &filename, bool isBinary) } -Blob readTorchBlob(const String &filename, bool isBinary) +Mat readTorchBlob(const String &filename, bool isBinary) { Ptr importer(new TorchImporter(filename, isBinary)); importer->readObject(); diff --git a/modules/dnn/test/npy_blob.hpp b/modules/dnn/test/npy_blob.hpp index 7c9910172a7..b04e02c239e 100644 --- a/modules/dnn/test/npy_blob.hpp +++ b/modules/dnn/test/npy_blob.hpp @@ -44,22 +44,91 @@ #include "test_precomp.hpp" #include "cnpy.h" -inline cv::dnn::Blob blobFromNPY(const cv::String &path) +namespace cv { - cnpy::NpyArray npyBlob = cnpy::npy_load(path.c_str()); - cv::dnn::BlobShape shape((int)npyBlob.shape.size(), (int*)&npyBlob.shape[0]); - - cv::dnn::Blob blob(shape); - blob.fill(shape, CV_32F, npyBlob.data); +inline Mat blobFromNPY(const String &path) +{ + cnpy::NpyArray npyBlob = cnpy::npy_load(path.c_str()); + Mat blob = Mat((int)npyBlob.shape.size(), (int*)&npyBlob.shape[0], CV_32F, npyBlob.data).clone(); npyBlob.destruct(); return blob; } -inline void saveBlobToNPY(cv::dnn::Blob &blob, const cv::String &path) +inline void saveBlobToNPY(const Mat &blob, const String &path) +{ + cnpy::npy_save(path.c_str(), blob.ptr(), (unsigned*)&blob.size.p[0], blob.dims); +} + +inline size_t shapeTotal(const std::vector& shape) +{ + size_t p = 1, i, n = shape.size(); + for( i = 0; i < n; i++) + p *= shape[i]; + return p; +} + +inline bool shapeEqual(const std::vector& shape1, const std::vector& shape2) +{ + size_t i, n1 = shape1.size(), n2 = shape2.size(); + if( n1 != n2 ) + return false; + for( i = 0; i < n1; i++ ) + if( shape1[i] != shape2[i] ) + return false; + return true; +} + +inline std::vector getShape(const Mat& m) { - cv::dnn::BlobShape shape = blob.shape(); - cnpy::npy_save(path.c_str(), blob.ptrf(), (unsigned*)&shape[0], shape.dims()); + return m.empty() ? 
std::vector() : std::vector(&m.size.p[0], &m.size.p[0] + m.dims); +} + +inline std::vector makeShape(int a0, int a1=-1, int a2=-1, int a3=-1, int a4=-1, int a5=-1) +{ + std::vector s; + s.push_back(a0); + if(a1 > 0) + { + s.push_back(a1); + if(a2 > 0) + { + s.push_back(a2); + if(a3 > 0) + { + s.push_back(a3); + if(a4 > 0) + { + s.push_back(a4); + if(a5 > 0) + s.push_back(a5); + } + } + } + } + return s; +} + +inline std::vector concatShape(const std::vector& a, const std::vector& b) +{ + size_t na = a.size(), nb = b.size(); + std::vector c(na + nb); + + std::copy(a.begin(), a.end(), c.begin()); + std::copy(b.begin(), b.end(), c.begin() + na); + + return c; +} + +inline void printShape(const String& name, const std::vector& shape) +{ + printf("%s: [", name.c_str()); + size_t i, n = shape.size(); + for( i = 0; i < n; i++ ) + printf(" %d", shape[i]); + printf(" ]\n"); +} + } #endif diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index 5d79c1a3bf0..536bda6a059 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -87,18 +87,17 @@ TEST(Reproducibility_AlexNet, Accuracy) Mat sample = imread(_tf("grace_hopper_227.png")); ASSERT_TRUE(!sample.empty()); - cv::cvtColor(sample, sample, cv::COLOR_BGR2RGB); Size inputSize(227, 227); if (sample.size() != inputSize) resize(sample, sample, inputSize); - net.setBlob(".data", dnn::Blob::fromImages(sample)); + net.setBlob(".data", blobFromImage(sample, 1.)); net.forward(); - Blob out = net.getBlob("prob"); - Blob ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); + Mat out = net.getBlob("prob"); + Mat ref = blobFromNPY(_tf("caffe_alexnet_prob.npy")); normAssert(ref, out); } @@ -120,14 +119,11 @@ TEST(Reproducibility_FCN, Accuracy) if (sample.size() != inputSize) resize(sample, sample, inputSize); - cv::cvtColor(sample, sample, cv::COLOR_BGR2RGB); - - net.setBlob(".data", dnn::Blob::fromImages(sample)); + net.setBlob(".data", blobFromImage(sample, 1.)); net.forward(); - Blob out = net.getBlob("score"); - - Blob ref = blobFromNPY(_tf("caffe_fcn8s_prob.npy")); + Mat out = net.getBlob("score"); + Mat ref = blobFromNPY(_tf("caffe_fcn8s_prob.npy")); normAssert(ref, out); } diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp index b9a4ee12e2d..eb38d766c6b 100644 --- a/modules/dnn/test/test_common.hpp +++ b/modules/dnn/test/test_common.hpp @@ -50,16 +50,10 @@ inline const std::string &getOpenCVExtraDir() inline void normAssert(cv::InputArray ref, cv::InputArray test, const char *comment = "") { double normL1 = cvtest::norm(ref, test, cv::NORM_L1) / ref.getMat().total(); - EXPECT_LE(normL1, 0.0001) << comment; + EXPECT_LE(normL1, 0.002) << comment; double normInf = cvtest::norm(ref, test, cv::NORM_INF); - EXPECT_LE(normInf, 0.001) << comment; -} - -inline void normAssert(cv::dnn::Blob &ref, cv::dnn::Blob &test, const char *comment = "") -{ - ASSERT_EQ(ref.shape(), test.shape()) << comment; - normAssert(ref.matRefConst(), test.matRefConst(), comment); + EXPECT_LE(normInf, 0.08) << comment; } #endif diff --git a/modules/dnn/test/test_googlenet.cpp b/modules/dnn/test/test_googlenet.cpp index f1fafb0077b..82f3ec1a89e 100644 --- a/modules/dnn/test/test_googlenet.cpp +++ b/modules/dnn/test/test_googlenet.cpp @@ -72,23 +72,17 @@ static void launchGoogleNetTest() inpMats.push_back( imread(_tf("googlenet_1.jpg")) ); ASSERT_TRUE(!inpMats[0].empty() && !inpMats[1].empty()); - net.setBlob(".data", Blob::fromImages(inpMats)); + net.setBlob(".data", 
blobFromImages(inpMats, 1.)); net.forward(); - Blob out = net.getBlob("prob"); - Blob ref = blobFromNPY(_tf("googlenet_prob.npy")); + Mat out = net.getBlob("prob"); + Mat ref = blobFromNPY(_tf("googlenet_prob.npy")); normAssert(out, ref); } TEST(Reproducibility_GoogLeNet, Accuracy) { - OCL_OFF(launchGoogleNetTest()); -} - -OCL_TEST(Reproducibility_GoogLeNet, Accuracy) -{ - OCL_ON(launchGoogleNetTest()); - OCL_OFF(); + launchGoogleNetTest(); } } diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index c4544499b39..34a8ef24354 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -55,31 +55,33 @@ using namespace cv::dnn; template static String _tf(TString filename) { - return (getOpenCVExtraDir() + "/dnn/layers/") + filename; + String basetestdir = getOpenCVExtraDir(); + size_t len = basetestdir.size(); + if(len > 0 && basetestdir[len-1] != '/' && basetestdir[len-1] != '\\') + return (basetestdir + "/dnn/layers") + filename; + return (basetestdir + "dnn/layers/") + filename; } - -enum RunLayerMode +void runLayer(Ptr layer, std::vector &inpBlobs, std::vector &outBlobs) { - ALLOC_ONLY = 1, - FORWARD_ONLY = 2, - ALLOC_AND_FORWARD = ALLOC_ONLY | FORWARD_ONLY -}; + size_t i, ninputs = inpBlobs.size(); + std::vector inp_(ninputs); + std::vector inp(ninputs); + std::vector outp; -typedef Ptr > PtrToVecPtrBlob; - -PtrToVecPtrBlob -runLayer(Ptr layer, std::vector &inpBlobs, std::vector &outBlobs, int mode = ALLOC_AND_FORWARD) -{ - PtrToVecPtrBlob inpPtrs(new std::vector()); - inpPtrs->reserve(inpBlobs.size()); - for (size_t i = 0; i < inpBlobs.size(); i++) - inpPtrs->push_back(&inpBlobs[i]); + for( i = 0; i < ninputs; i++ ) + { + inp_[i] = inpBlobs[i].clone(); + inp[i] = &inp_[i]; + } - if (mode & ALLOC_ONLY) layer->allocate(*inpPtrs, outBlobs); - if (mode & FORWARD_ONLY) layer->forward(*inpPtrs, outBlobs); + layer->allocate(inp, outp); + layer->forward(inp, outp); - return inpPtrs; + size_t noutputs = outp.size(); + outBlobs.resize(noutputs); + for( i = 0; i < noutputs; i++ ) + outBlobs[i] = outp[i]; } @@ -100,102 +102,59 @@ void testLayerUsingCaffeModels(String basename, bool useCaffeModel = false, bool importer->populateNet(net); } - Blob inp = blobFromNPY(inpfile); - Blob ref = blobFromNPY(outfile); + Mat inp = blobFromNPY(inpfile); + Mat ref = blobFromNPY(outfile); net.setBlob(".input", inp); net.forward(); - Blob out = net.getBlob("output"); + Mat out = net.getBlob("output"); normAssert(ref, out); } TEST(Layer_Test_Softmax, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_softmax")); -} -OCL_TEST(Layer_Test_Softmax, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_softmax")); - OCL_OFF(); + testLayerUsingCaffeModels("layer_softmax"); } TEST(Layer_Test_LRN_spatial, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_lrn_spatial")); -} -OCL_TEST(Layer_Test_LRN_spatial, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_lrn_spatial")); - OCL_OFF(); + testLayerUsingCaffeModels("layer_lrn_spatial"); } TEST(Layer_Test_LRN_channels, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_lrn_channels")); -} -OCL_TEST(Layer_Test_LRN_channels, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_lrn_channels")); - OCL_OFF(); + testLayerUsingCaffeModels("layer_lrn_channels"); } TEST(Layer_Test_Convolution, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_convolution", true)); -} -OCL_TEST(Layer_Test_Convolution, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_convolution", true)); - OCL_OFF(); + 
testLayerUsingCaffeModels("layer_convolution", true); } TEST(Layer_Test_DeConvolution, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_deconvolution", true, false)); -} - -OCL_TEST(Layer_Test_DeConvolution, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_deconvolution", true, false);); - OCL_OFF(); + testLayerUsingCaffeModels("layer_deconvolution", true, false); } TEST(Layer_Test_InnerProduct, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_inner_product", true)); -} -OCL_TEST(Layer_Test_InnerProduct, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_inner_product", true)); - OCL_OFF(); + testLayerUsingCaffeModels("layer_inner_product", true); } TEST(Layer_Test_Pooling_max, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_pooling_max")); - OCL_ON(); -} -OCL_TEST(Layer_Test_Pooling_max, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_pooling_max")); - OCL_OFF(); + testLayerUsingCaffeModels("layer_pooling_max"); } TEST(Layer_Test_Pooling_ave, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_pooling_ave")); - OCL_ON(); -} -OCL_TEST(Layer_Test_Pooling_ave, Accuracy) -{ - OCL_ON(testLayerUsingCaffeModels("layer_pooling_ave")); - OCL_OFF(); + testLayerUsingCaffeModels("layer_pooling_ave"); } TEST(Layer_Test_MVN, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_mvn")); + testLayerUsingCaffeModels("layer_mvn"); } TEST(Layer_Test_Reshape, squeeze) @@ -204,20 +163,25 @@ TEST(Layer_Test_Reshape, squeeze) params.set("axis", 2); params.set("num_axes", 1); - Blob inp(BlobShape(4, 3, 1, 2)); - std::vector inpVec(1, &inp); - std::vector outVec; + int sz[] = {4, 3, 1, 2}; + Mat inp(4, sz, CV_32F); + std::vector inpVec(1, &inp); + std::vector outVec; Ptr rl = LayerFactory::createLayerInstance("Reshape", params); rl->allocate(inpVec, outVec); rl->forward(inpVec, outVec); - EXPECT_EQ(outVec[0].shape(), BlobShape(4, 3, 2)); + Mat& out = outVec[0]; + std::vector shape(out.size.p, out.size.p + out.dims); + int sh0[] = {4, 3, 2}; + std::vector shape0(sh0, sh0+3); + EXPECT_TRUE(shapeEqual(shape, shape0)); } TEST(Layer_Test_BatchNorm, Accuracy) { - OCL_OFF(testLayerUsingCaffeModels("layer_batch_norm", true)); + testLayerUsingCaffeModels("layer_batch_norm", true); } //template @@ -232,16 +196,15 @@ TEST(Layer_Test_BatchNorm, Accuracy) //} //TEST(Layer_Concat, Accuracy) //{ -// OCL_OFF(test_Layer_Concat()); +// test_Layer_Concat()); //} //OCL_TEST(Layer_Concat, Accuracy) //{ // OCL_ON(test_Layer_Concat()); -// OCL_OFF(); +// ); //} -template -void test_Reshape_Split_Slice_layers() +static void test_Reshape_Split_Slice_layers() { Net net; { @@ -250,46 +213,41 @@ void test_Reshape_Split_Slice_layers() importer->populateNet(net); } - Blob input(BlobShape(6, 12)); + Mat input(6, 12, CV_32F); RNG rng(0); - rng.fill(input.getRef(), RNG::UNIFORM, -1, 1); + rng.fill(input, RNG::UNIFORM, -1, 1); net.setBlob(".input", input); net.forward(); - Blob output = net.getBlob("output"); + Mat output = net.getBlob("output"); normAssert(input, output); } TEST(Layer_Test_Reshape_Split_Slice, Accuracy) { - OCL_OFF(test_Reshape_Split_Slice_layers()); -} -OCL_TEST(Layer_Test_Reshape_Split_Slice, Accuracy) -{ - OCL_ON(test_Reshape_Split_Slice_layers()); - OCL_OFF(); + test_Reshape_Split_Slice_layers(); } class Layer_LSTM_Test : public ::testing::Test { public: int numInp, numOut; - Blob Wh, Wx, b; + Mat Wh, Wx, b; Ptr layer; - std::vector inputs, outputs; + std::vector inputs, outputs; Layer_LSTM_Test() {} - void init(const BlobShape &inpShape_, const BlobShape &outShape_) + void init(const 
std::vector &inpShape_, const std::vector &outShape_) { - numInp = inpShape_.total(); - numOut = outShape_.total(); + numInp = (int)shapeTotal(inpShape_); + numOut = (int)shapeTotal(outShape_); - Wh = Blob(BlobShape(4 * numOut, numOut)); - Wx = Blob(BlobShape(4 * numOut, numInp)); - b = Blob(BlobShape(4 * numOut, 1)); + Wh = Mat::ones(4 * numOut, numOut, CV_32F); + Wx = Mat::ones(4 * numOut, numInp, CV_32F); + b = Mat::ones(4 * numOut, 1, CV_32F); - layer = LSTMLayer::create(); + layer = LSTMLayer::create(LayerParams()); layer->setWeights(Wh, Wx, b); layer->setOutShape(outShape_); } @@ -297,27 +255,43 @@ class Layer_LSTM_Test : public ::testing::Test TEST_F(Layer_LSTM_Test, get_set_test) { - BlobShape TN(4); - BlobShape inpShape(5, 3, 2), inpResShape = TN + inpShape; - BlobShape outShape(3, 1, 2), outResShape = TN + outShape; + const int TN = 4; + std::vector inpShape = makeShape(5, 3, 2); + std::vector outShape = makeShape(3, 1, 2); + std::vector inpResShape = concatShape(makeShape(TN), inpShape); + std::vector outResShape = concatShape(makeShape(TN), outShape); init(inpShape, outShape); layer->setProduceCellOutput(true); layer->setUseTimstampsDim(false); layer->setOutShape(outShape); - layer->setC(Blob(outResShape)); - layer->setH(Blob(outResShape)); + Mat C((int)outResShape.size(), &outResShape[0], CV_32F); + randu(C, -1., 1.); + Mat H = C.clone(); + randu(H, -1., 1.); + layer->setC(C); + layer->setH(H); - inputs.push_back(Blob(inpResShape)); + Mat inp((int)inpResShape.size(), &inpResShape[0], CV_32F); + randu(inp, -1., 1.); + + inputs.push_back(inp); runLayer(layer, inputs, outputs); EXPECT_EQ(2u, outputs.size()); - EXPECT_EQ(outResShape, outputs[0].shape()); - EXPECT_EQ(outResShape, outputs[1].shape()); - EXPECT_EQ(outResShape, layer->getC().shape()); - EXPECT_EQ(outResShape, layer->getH().shape()); + printShape("outResShape", outResShape); + printShape("out0", getShape(outputs[0])); + printShape("out1", getShape(outputs[0])); + printShape("C", getShape(layer->getC())); + printShape("H", getShape(layer->getH())); + + EXPECT_TRUE(shapeEqual(outResShape, getShape(outputs[0]))); + EXPECT_TRUE(shapeEqual(outResShape, getShape(outputs[1]))); + + EXPECT_TRUE(shapeEqual(outResShape, getShape(layer->getC()))); + EXPECT_TRUE(shapeEqual(outResShape, getShape(layer->getH()))); EXPECT_EQ(0, layer->inputNameToIndex("x")); EXPECT_EQ(0, layer->outputNameToIndex("h")); @@ -326,24 +300,24 @@ TEST_F(Layer_LSTM_Test, get_set_test) TEST(Layer_LSTM_Test_Accuracy_with_, CaffeRecurrent) { - Ptr layer = LSTMLayer::create(); + Ptr layer = LSTMLayer::create(LayerParams()); - Blob Wx = blobFromNPY(_tf("lstm.prototxt.w_0.npy")); - Blob Wh = blobFromNPY(_tf("lstm.prototxt.w_2.npy")); - Blob b = blobFromNPY(_tf("lstm.prototxt.w_1.npy")); + Mat Wx = blobFromNPY(_tf("lstm.prototxt.w_0.npy")); + Mat Wh = blobFromNPY(_tf("lstm.prototxt.w_2.npy")); + Mat b = blobFromNPY(_tf("lstm.prototxt.w_1.npy")); layer->setWeights(Wh, Wx, b); - Blob inp = blobFromNPY(_tf("recurrent.input.npy")); - std::vector inputs(1, inp), outputs; + Mat inp = blobFromNPY(_tf("recurrent.input.npy")); + std::vector inputs(1, inp), outputs; runLayer(layer, inputs, outputs); - Blob h_t_reference = blobFromNPY(_tf("lstm.prototxt.h_1.npy")); + Mat h_t_reference = blobFromNPY(_tf("lstm.prototxt.h_1.npy")); normAssert(h_t_reference, outputs[0]); } TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent) { - Ptr layer = RNNLayer::create(); + Ptr layer = RNNLayer::create(LayerParams()); layer->setWeights( blobFromNPY(_tf("rnn.prototxt.w_0.npy")), @@ -352,10 
+326,10 @@ TEST(Layer_RNN_Test_Accuracy_with_, CaffeRecurrent) blobFromNPY(_tf("rnn.prototxt.w_3.npy")), blobFromNPY(_tf("rnn.prototxt.w_4.npy")) ); - std::vector output, input(1, blobFromNPY(_tf("recurrent.input.npy"))); + std::vector output, input(1, blobFromNPY(_tf("recurrent.input.npy"))); runLayer(layer, input, output); - Blob h_ref = blobFromNPY(_tf("rnn.prototxt.h_1.npy")); + Mat h_ref = blobFromNPY(_tf("rnn.prototxt.h_1.npy")); normAssert(h_ref, output[0]); } @@ -364,10 +338,10 @@ class Layer_RNN_Test : public ::testing::Test { public: int nX, nH, nO, nT, nS; - Blob Whh, Wxh, bh, Who, bo; + Mat Whh, Wxh, bh, Who, bo; Ptr layer; - std::vector inputs, outputs; + std::vector inputs, outputs; Layer_RNN_Test() { @@ -377,13 +351,13 @@ class Layer_RNN_Test : public ::testing::Test nH = 64; nO = 100; - Whh = Blob(BlobShape(nH, nH)); - Wxh = Blob(BlobShape(nH, nX)); - bh = Blob(BlobShape(nH, 1)); - Who = Blob(BlobShape(nO, nH)); - bo = Blob(BlobShape(nO, 1)); + Whh = Mat::ones(nH, nH, CV_32F); + Wxh = Mat::ones(nH, nX, CV_32F); + bh = Mat::ones(nH, 1, CV_32F); + Who = Mat::ones(nO, nH, CV_32F); + bo = Mat::ones(nO, 1, CV_32F); - layer = RNNLayer::create(); + layer = RNNLayer::create(LayerParams()); layer->setProduceHiddenOutput(true); layer->setWeights(Wxh, bh, Whh, Who, bo); } @@ -391,12 +365,15 @@ class Layer_RNN_Test : public ::testing::Test TEST_F(Layer_RNN_Test, get_set_test) { - inputs.push_back(Blob(BlobShape(nT, nS, 1, nX))); + int sz[] = { nT, nS, 1, nX }; + Mat inp(4, sz, CV_32F); + randu(inp, -1., 1.); + inputs.push_back(inp); runLayer(layer, inputs, outputs); EXPECT_EQ(outputs.size(), 2u); - EXPECT_EQ(outputs[0].shape(), BlobShape(nT, nS, nO)); - EXPECT_EQ(outputs[1].shape(), BlobShape(nT, nS, nH)); + EXPECT_TRUE(shapeEqual(getShape(outputs[0]), makeShape(nT, nS, nO))); + EXPECT_TRUE(shapeEqual(getShape(outputs[1]), makeShape(nT, nS, nH))); } } diff --git a/modules/dnn/test/test_main.cpp b/modules/dnn/test/test_main.cpp index 42917f29976..ff8ec044807 100644 --- a/modules/dnn/test/test_main.cpp +++ b/modules/dnn/test/test_main.cpp @@ -8,24 +8,4 @@ namespace cvtest using namespace cv; using namespace cv::dnn; -TEST(BlobShape_SimpleConstr, Regression) -{ - BlobShape sd; - - BlobShape s1(0); - EXPECT_EQ(s1.dims(), 1); - EXPECT_EQ(s1[0], 0); - - BlobShape s2(0, 0); - EXPECT_EQ(s2.dims(), 2); - EXPECT_EQ(s2[0], 0); - EXPECT_EQ(s2[1], 0); -} - -TEST(BlobShape_EmptyFill, Regression) -{ - BlobShape s(10, (int*)NULL); - EXPECT_EQ(s.dims(), 10); -} - } diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index bb73dd5cbb4..13690d39e9e 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -40,13 +40,13 @@ TEST(Test_TensorFlow, read_inception) resize(sample, input, Size(224, 224)); input -= 128; // mean sub - dnn::Blob inputBlob = dnn::Blob::fromImages(input); + Mat inputBlob = blobFromImage(input, 1.); net.setBlob("_input.input", inputBlob); net.forward(); - Blob out = net.getBlob("softmax2"); - std::cout << out.dims() << std::endl; + Mat out = net.getBlob("softmax2"); + std::cout << out.dims << std::endl; } TEST(Test_TensorFlow, inception_accuracy) @@ -62,15 +62,13 @@ TEST(Test_TensorFlow, inception_accuracy) Mat sample = imread(_tf("grace_hopper_227.png")); ASSERT_TRUE(!sample.empty()); resize(sample, sample, Size(224, 224)); - cv::cvtColor(sample, sample, cv::COLOR_BGR2RGB); - dnn::Blob inputBlob = dnn::Blob::fromImages(sample); + Mat inputBlob = blobFromImage(sample, 1.); net.setBlob(".input", inputBlob); 
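
Reviewer note: the rewritten runLayer helper earlier in test_layers.cpp boils down to building a vector of Mat pointers and calling allocate() then forward(), matching the new Layer signatures. A condensed version:

```cpp
// Sketch: run a single dnn layer on in-memory inputs.
#include <opencv2/dnn.hpp>
#include <vector>

static void runLayerOnce(const cv::Ptr<cv::dnn::Layer> &layer,
                         std::vector<cv::Mat> &inps, std::vector<cv::Mat> &outs)
{
    std::vector<cv::Mat*> inpPtrs(inps.size());
    for (size_t i = 0; i < inps.size(); i++)
        inpPtrs[i] = &inps[i];
    layer->allocate(inpPtrs, outs); // shapes the outputs from the inputs
    layer->forward(inpPtrs, outs);  // fills them
}
```
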
net.forward(); - Blob out = net.getBlob("softmax2"); - - Blob ref = blobFromNPY(_tf("tf_inception_prob.npy")); + Mat out = net.getBlob("softmax2"); + Mat ref = blobFromNPY(_tf("tf_inception_prob.npy")); normAssert(ref, out); } diff --git a/modules/dnn/test/test_torch_importer.cpp b/modules/dnn/test/test_torch_importer.cpp index dae3eda8e6f..3da0ccac78c 100644 --- a/modules/dnn/test/test_torch_importer.cpp +++ b/modules/dnn/test/test_torch_importer.cpp @@ -82,7 +82,7 @@ static void runTorchNet(String prefix, String outLayerName = "", ASSERT_TRUE(importer != NULL); importer->populateNet(net); - Blob inp, outRef; + Mat inp, outRef; ASSERT_NO_THROW( inp = readTorchBlob(_tf(prefix + "_input" + suffix), isBinary) ); ASSERT_NO_THROW( outRef = readTorchBlob(_tf(prefix + "_output" + suffix), isBinary) ); @@ -90,14 +90,14 @@ net.forward(); if (outLayerName.empty()) outLayerName = net.getLayerNames().back(); - Blob out = net.getBlob(outLayerName); + Mat out = net.getBlob(outLayerName); normAssert(outRef, out); if (check2ndBlob) { - Blob out2 = net.getBlob(outLayerName + ".1"); - Blob ref2 = readTorchBlob(_tf(prefix + "_output_2" + suffix), isBinary); + Mat out2 = net.getBlob(outLayerName + ".1"); + Mat ref2 = readTorchBlob(_tf(prefix + "_output_2" + suffix), isBinary); normAssert(out2, ref2); } } @@ -169,15 +169,12 @@ TEST(Torch_Importer, ENet_accuracy) } Mat sample = imread(_tf("street.png", false)); - cv::cvtColor(sample, sample, cv::COLOR_BGR2RGB); - sample.convertTo(sample, CV_32F, 1/255.0); - dnn::Blob inputBlob = dnn::Blob::fromImages(sample); + Mat inputBlob = blobFromImage(sample, 1./255); net.setBlob("", inputBlob); net.forward(); - dnn::Blob out = net.getBlob(net.getLayerNames().back()); - - Blob ref = blobFromNPY(_tf("torch_enet_prob.npy", false)); + Mat out = net.getBlob(net.getLayerNames().back()); + Mat ref = blobFromNPY(_tf("torch_enet_prob.npy", false)); normAssert(ref, out); } diff --git a/modules/dnn/tutorials/tutorial_dnn_googlenet.markdown b/modules/dnn/tutorials/tutorial_dnn_googlenet.markdown index 1eaaf251ac5..01bfca11704 100644 --- a/modules/dnn/tutorials/tutorial_dnn_googlenet.markdown +++ b/modules/dnn/tutorials/tutorial_dnn_googlenet.markdown @@ -41,7 +41,7 @@ Explanation Now image is actually a 3-dimensional array with 224x224x3 shape. - Next, we convert the image to 4-dimensional blob (so-called batch) with 1x3x224x224 shape by using special cv::dnn::Blob::fromImages constructor. + Next, we convert the image to a 4-dimensional blob (a so-called batch) with 1x3x224x224 shape using the cv::dnn::blobFromImages function.
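
Reviewer note: for readers of the tutorial line above, blobFromImage/blobFromImages is what packs an HxWxC image into a 1xCxHxW float blob. A hand-rolled equivalent for a single image — a sketch only, ignoring the scale-factor argument and any mean handling the real function may perform:

```cpp
// Sketch: HWC image -> 1xCxHxW float blob.
#include <opencv2/core.hpp>
#include <vector>

static cv::Mat imageToBlob(const cv::Mat &img)
{
    cv::Mat imgF;
    img.convertTo(imgF, CV_32F);
    std::vector<cv::Mat> channels;
    cv::split(imgF, channels); // interleaved HWC -> C planes of H x W
    int sz[] = { 1, (int)channels.size(), img.rows, img.cols };
    cv::Mat blob(4, sz, CV_32F);
    for (int c = 0; c < (int)channels.size(); c++)
    {
        cv::Mat plane(img.rows, img.cols, CV_32F, blob.ptr<float>(0, c));
        channels[c].copyTo(plane); // fill the blob's c-th channel in place
    }
    return blob;
}
```
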
-# Pass the blob to the network @snippet dnn/samples/caffe_googlenet.cpp Set input blob diff --git a/modules/tracking/src/gtrTracker.cpp b/modules/tracking/src/gtrTracker.cpp index 5fc9092528c..0ebdd55c9ae 100644 --- a/modules/tracking/src/gtrTracker.cpp +++ b/modules/tracking/src/gtrTracker.cpp @@ -165,19 +165,14 @@ bool TrackerGOTURNImpl::updateImpl(const Mat& image, Rect2d& boundingBox) searchPatch = searchPatch - 128; //Convert to Float type - targetPatch.convertTo(targetPatch, CV_32F); - searchPatch.convertTo(searchPatch, CV_32F); - - dnn::Blob targetBlob = dnn::Blob::fromImages(targetPatch); - dnn::Blob searchBlob = dnn::Blob::fromImages(searchPatch); + Mat targetBlob = dnn::blobFromImage(targetPatch); + Mat searchBlob = dnn::blobFromImage(searchPatch); net.setBlob(".data1", targetBlob); net.setBlob(".data2", searchBlob); net.forward(); - dnn::Blob res = net.getBlob("scale"); - - Mat resMat = res.matRefConst().reshape(1, 1); + Mat resMat = net.getBlob("scale").reshape(1, 1); curBB.x = targetPatchRect.x + (resMat.at<float>(0) * targetPatchRect.width / INPUT_SIZE) - targetPatchRect.width; curBB.y = targetPatchRect.y + (resMat.at<float>(1) * targetPatchRect.height / INPUT_SIZE) - targetPatchRect.height;
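
Reviewer note: a compact crib of the Blob-to-Mat replacements this patch applies everywhere, useful when porting out-of-tree layers (old calls in the trailing comments; not an exhaustive mapping):

```cpp
// Sketch: old dnn::Blob idioms and their cv::Mat equivalents.
#include <opencv2/core.hpp>

static void blobMigrationCrib()
{
    int sz[] = { 1, 3, 224, 224 };
    cv::Mat blob(4, sz, CV_32F);     // was: Blob(BlobShape(1, 3, 224, 224))
    int channels = blob.size[1];     // was: blob.channels()
    int dims = blob.dims;            // was: blob.dims()
    float *data = blob.ptr<float>(); // was: blob.ptrf()
    cv::Mat &m = blob;               // was: blob.matRef() / blob.matRefConst()
    (void)channels; (void)dims; (void)data; (void)m;
}
```
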