Skip to content

Commit

Permalink
Merge pull request #12243 from dkurt:dnn_tf_mask_rcnn
Browse files Browse the repository at this point in the history
* Support Mask-RCNN from TensorFlow

* Fix a sample
  • Loading branch information
dkurt authored and alalek committed Aug 24, 2018
1 parent 4f360f8 commit 472b71e
Show file tree
Hide file tree
Showing 9 changed files with 600 additions and 153 deletions.
7 changes: 7 additions & 0 deletions modules/dnn/src/layers/crop_and_resize_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,13 @@ class CropAndResizeLayerImpl CV_FINAL : public CropAndResizeLayer
}
}
}
if (boxes.rows < out.size[0])
{
// left = top = right = bottom = 0
std::vector<cv::Range> dstRanges(4, Range::all());
dstRanges[0] = Range(boxes.rows, out.size[0]);
out(dstRanges).setTo(inp.ptr<float>(0, 0, 0)[0]);
}
}

private:
Expand Down
50 changes: 40 additions & 10 deletions modules/dnn/src/layers/detection_output_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
// It's true whenever predicted bounding boxes and proposals are normalized to [0, 1].
bool _bboxesNormalized;
bool _clip;
bool _groupByClasses;

enum { _numAxes = 4 };
static const std::string _layerName;
Expand Down Expand Up @@ -183,6 +184,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
_locPredTransposed = getParameter<bool>(params, "loc_pred_transposed", 0, false, false);
_bboxesNormalized = getParameter<bool>(params, "normalized_bbox", 0, false, true);
_clip = getParameter<bool>(params, "clip", 0, false, false);
_groupByClasses = getParameter<bool>(params, "group_by_classes", 0, false, true);

getCodeType(params);

Expand Down Expand Up @@ -381,7 +383,7 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
{
count += outputDetections_(i, &outputsData[count * 7],
allDecodedBBoxes[i], allConfidenceScores[i],
allIndices[i]);
allIndices[i], _groupByClasses);
}
CV_Assert(count == numKept);
}
Expand Down Expand Up @@ -497,17 +499,44 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
{
count += outputDetections_(i, &outputsData[count * 7],
allDecodedBBoxes[i], allConfidenceScores[i],
allIndices[i]);
allIndices[i], _groupByClasses);
}
CV_Assert(count == numKept);
}

size_t outputDetections_(
const int i, float* outputsData,
const LabelBBox& decodeBBoxes, Mat& confidenceScores,
const std::map<int, std::vector<int> >& indicesMap
const std::map<int, std::vector<int> >& indicesMap,
bool groupByClasses
)
{
std::vector<int> dstIndices;
std::vector<std::pair<float, int> > allScores;
for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
{
int label = it->first;
if (confidenceScores.rows <= label)
CV_Error_(cv::Error::StsError, ("Could not find confidence predictions for label %d", label));
const std::vector<float>& scores = confidenceScores.row(label);
const std::vector<int>& indices = it->second;

const int numAllScores = allScores.size();
allScores.reserve(numAllScores + indices.size());
for (size_t j = 0; j < indices.size(); ++j)
{
allScores.push_back(std::make_pair(scores[indices[j]], numAllScores + j));
}
}
if (!groupByClasses)
std::sort(allScores.begin(), allScores.end(), util::SortScorePairDescend<int>);

dstIndices.resize(allScores.size());
for (size_t j = 0; j < dstIndices.size(); ++j)
{
dstIndices[allScores[j].second] = j;
}

size_t count = 0;
for (std::map<int, std::vector<int> >::const_iterator it = indicesMap.begin(); it != indicesMap.end(); ++it)
{
Expand All @@ -524,14 +553,15 @@ class DetectionOutputLayerImpl CV_FINAL : public DetectionOutputLayer
for (size_t j = 0; j < indices.size(); ++j, ++count)
{
int idx = indices[j];
int dstIdx = dstIndices[count];
const util::NormalizedBBox& decode_bbox = label_bboxes->second[idx];
outputsData[count * 7] = i;
outputsData[count * 7 + 1] = label;
outputsData[count * 7 + 2] = scores[idx];
outputsData[count * 7 + 3] = decode_bbox.xmin;
outputsData[count * 7 + 4] = decode_bbox.ymin;
outputsData[count * 7 + 5] = decode_bbox.xmax;
outputsData[count * 7 + 6] = decode_bbox.ymax;
outputsData[dstIdx * 7] = i;
outputsData[dstIdx * 7 + 1] = label;
outputsData[dstIdx * 7 + 2] = scores[idx];
outputsData[dstIdx * 7 + 3] = decode_bbox.xmin;
outputsData[dstIdx * 7 + 4] = decode_bbox.ymin;
outputsData[dstIdx * 7 + 5] = decode_bbox.xmax;
outputsData[dstIdx * 7 + 6] = decode_bbox.ymax;
}
}
return count;
Expand Down
16 changes: 11 additions & 5 deletions modules/dnn/src/layers/resize_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ class ResizeLayerImpl : public ResizeLayer
interpolation = params.get<String>("interpolation");
CV_Assert(interpolation == "nearest" || interpolation == "bilinear");

bool alignCorners = params.get<bool>("align_corners", false);
if (alignCorners)
CV_Error(Error::StsNotImplemented, "Resize with align_corners=true is not implemented");
alignCorners = params.get<bool>("align_corners", false);
}

bool getMemoryShapes(const std::vector<MatShape> &inputs,
Expand Down Expand Up @@ -66,8 +64,15 @@ class ResizeLayerImpl : public ResizeLayer
outHeight = outputs[0].size[2];
outWidth = outputs[0].size[3];
}
scaleHeight = static_cast<float>(inputs[0]->size[2]) / outHeight;
scaleWidth = static_cast<float>(inputs[0]->size[3]) / outWidth;
if (alignCorners && outHeight > 1)
scaleHeight = static_cast<float>(inputs[0]->size[2] - 1) / (outHeight - 1);
else
scaleHeight = static_cast<float>(inputs[0]->size[2]) / outHeight;

if (alignCorners && outWidth > 1)
scaleWidth = static_cast<float>(inputs[0]->size[3] - 1) / (outWidth - 1);
else
scaleWidth = static_cast<float>(inputs[0]->size[3]) / outWidth;
}

void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
Expand Down Expand Up @@ -166,6 +171,7 @@ class ResizeLayerImpl : public ResizeLayer
int outWidth, outHeight, zoomFactorWidth, zoomFactorHeight;
String interpolation;
float scaleWidth, scaleHeight;
bool alignCorners;
};


Expand Down
52 changes: 52 additions & 0 deletions modules/dnn/test/test_tf_importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -537,4 +537,56 @@ TEST(Test_TensorFlow, two_inputs)
normAssert(out, firstInput + secondInput);
}

// Imports the mask_rcnn_inception_v2_coco_2018_01_28 TensorFlow graph, runs it on
// a single image with the OpenCV backend and compares both requested outputs
// (detections and per-class masks) against reference blobs stored as .npy files.
TEST(Test_TensorFlow, Mask_RCNN)
{
    // Text graph description and frozen weights of the model under test.
    std::string config = findDataFile("dnn/mask_rcnn_inception_v2_coco_2018_01_28.pbtxt", false);
    std::string weights = findDataFile("dnn/mask_rcnn_inception_v2_coco_2018_01_28.pb", false);

    Net net = readNetFromTensorflow(weights, config);
    Mat image = imread(findDataFile("dnn/street.png", false));
    Mat expectedDetections = blobFromNPY(path("mask_rcnn_inception_v2_coco_2018_01_28.detection_out.npy"));
    Mat expectedMasks = blobFromNPY(path("mask_rcnn_inception_v2_coco_2018_01_28.detection_masks.npy"));
    Mat inputBlob = blobFromImage(image, 1.0f, Size(800, 800), Scalar(), true, false);

    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setInput(inputBlob);

    // Both outputs are fetched in a single forward pass: boxes first, masks second.
    std::vector<String> requested;
    requested.push_back("detection_out_final");
    requested.push_back("detection_masks");

    std::vector<Mat> results;
    net.forward(results, requested);

    Mat detections = results[0];
    Mat rawMasks = results[1];
    normAssertDetections(expectedDetections, detections, "", /*threshold for zero confidence*/1e-5);

    // The masks blob is laid out as NxCxHxW:
    //   N   - number of detected boxes
    //   C   - number of classes (excluding background)
    //   HxW - segmentation shape
    const int detectionCount = detections.size[2];

    // Destination blob keeps exactly one mask (that of the predicted class) per detection.
    int pickedShape[] = {1, detectionCount, rawMasks.size[2], rawMasks.size[3]};
    Mat pickedMasks(4, &pickedShape[0], CV_32F);

    std::vector<cv::Range> from(4, cv::Range::all());
    std::vector<cv::Range> to(4, cv::Range::all());

    // One row of 7 values per detection; column 1 holds the class id.
    detections = detections.reshape(1, detections.total() / 7);
    for (int det = 0; det < detectionCount; ++det)
    {
        const int classId = static_cast<int>(detections.at<float>(det, 1));
        const cv::Range detRange(det, det + 1);

        // Source: detection `det`, channel `classId`. Destination: channel `det`.
        from[0] = detRange;
        to[1] = detRange;
        from[1] = cv::Range(classId, classId + 1);
        rawMasks(from).copyTo(pickedMasks(to));
    }

    // Only the first `detectionCount` reference masks have a counterpart to compare with.
    cv::Range refSlice[] = {Range::all(), Range(0, detectionCount), Range::all(), Range::all()};
    normAssert(pickedMasks, expectedMasks(&refSlice[0]));
}

}
143 changes: 143 additions & 0 deletions samples/dnn/mask_rcnn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import cv2 as cv
import argparse
import numpy as np

# Command line interface of the sample. --model and --config are mandatory;
# every other option either has a default or may be omitted.
parser = argparse.ArgumentParser(description=
        'Use this script to run Mask-RCNN object detection and semantic '
        'segmentation network from TensorFlow Object Detection API.')
parser.add_argument('--input', help='Path to input image or video file. Skip this argument to capture frames from a camera.')
parser.add_argument('--model', required=True, help='Path to a .pb file with weights.')
# The text graph description uses the .pbtxt extension (the help used to say ".pxtxt").
parser.add_argument('--config', required=True, help='Path to a .pbtxt file contains network configuration.')
parser.add_argument('--classes', help='Optional path to a text file with names of classes.')
parser.add_argument('--colors', help='Optional path to a text file with colors for an every class. '
                    'An every color is represented with three values from 0 to 255 in BGR channels order.')
parser.add_argument('--width', type=int, default=800,
                    help='Preprocess input image by resizing to a specific width.')
parser.add_argument('--height', type=int, default=800,
                    help='Preprocess input image by resizing to a specific height.')
parser.add_argument('--thr', type=float, default=0.5, help='Confidence threshold')
args = parser.parse_args()

# Fixed seed for NumPy's random generator (used later when class colors are
# generated because no --colors file was given).
np.random.seed(324)

# Class names: one name per line of the optional --classes file.
classes = None
if args.classes:
    with open(args.classes, 'rt') as f:
        classes = f.read().rstrip('\n').split('\n')

# Class colors: one line of space-separated values per class in the optional
# --colors file, each line converted to a uint8 array.
colors = None
if args.colors:
    colors = []
    with open(args.colors, 'rt') as f:
        for color in f.read().rstrip('\n').split('\n'):
            colors.append(np.array(color.split(' '), np.uint8))

# Legend image; stays None until showLegend() builds it.
legend = None
def showLegend(classes):
    """Build the class/color legend once and show it in a 'Legend' window.

    Does nothing when `classes` is None or when the legend image already
    exists. Reads the module-level `colors` list and stores the rendered image
    in the module-level `legend`.

    classes -- list of class names, or None when no names were provided.
    """
    global legend
    if classes is not None and legend is None:
        blockHeight = 30
        assert(len(classes) == len(colors))

        # One horizontal stripe of blockHeight rows per class.
        legend = np.zeros((blockHeight * len(colors), 200, 3), np.uint8)
        for i in range(len(classes)):
            block = legend[i * blockHeight:(i + 1) * blockHeight]
            block[:,:] = colors[i]
            # Floor division keeps the text origin an integer under Python 3 as
            # well; a float coordinate is rejected by cv.putText.
            cv.putText(block, classes[i], (0, blockHeight // 2), cv.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255))

        cv.namedWindow('Legend', cv.WINDOW_NORMAL)
        cv.imshow('Legend', legend)


def drawBox(frame, classId, conf, left, top, right, bottom):
    """Draw one detection on `frame`: a green rectangle plus a text caption.

    The caption is the confidence with two decimals, prefixed by the class
    name when the module-level `classes` list is available.

    frame                    -- image that is drawn on in place.
    classId                  -- index into `classes`.
    conf                     -- confidence value shown in the caption.
    left, top, right, bottom -- box corners passed to cv.rectangle.
    """
    cv.rectangle(frame, (left, top), (right, bottom), (0, 255, 0))

    caption = '%.2f' % conf
    if classes:
        assert(classId < len(classes))
        caption = '%s: %s' % (classes[classId], caption)

    (textWidth, textHeight), baseLine = cv.getTextSize(caption, cv.FONT_HERSHEY_SIMPLEX, 0.5, 1)
    # Push the caption down when the box touches the top of the image so the
    # text background does not start above row 0.
    textTop = max(top, textHeight)
    cv.rectangle(frame, (left, textTop - textHeight), (left + textWidth, textTop + baseLine), (255, 255, 255), cv.FILLED)
    cv.putText(frame, caption, (left, textTop), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))


# Create the network from the weights/config pair and run it with the OpenCV backend.
net = cv.dnn.readNet(args.model, args.config)
net.setPreferableBackend(cv.dnn.DNN_BACKEND_OPENCV)

winName = 'Mask-RCNN in OpenCV'
cv.namedWindow(winName, cv.WINDOW_NORMAL)

# Frames come from --input when given, otherwise from capture device 0.
cap = cv.VideoCapture(args.input if args.input else 0)
legend = None
# Process frames until any key is pressed or the source runs out of frames.
while cv.waitKey(1) < 0:
    hasFrame, frame = cap.read()
    if not hasFrame:
        # Keep the last result on screen until a key is pressed.
        cv.waitKey()
        break

    frameH, frameW = frame.shape[:2]

    # Create a 4D blob from the frame and feed it to the network.
    blob = cv.dnn.blobFromImage(frame, size=(args.width, args.height), swapRB=True, crop=False)
    net.setInput(blob)

    boxes, masks = net.forward(['detection_out_final', 'detection_masks'])

    numClasses = masks.shape[1]
    numDetections = boxes.shape[2]

    if not colors:
        # No colors were supplied: generate one per class. Each color is the
        # mean of the previous one (starting from black) and a random value.
        colors = []
        previousColor = np.array([0, 0, 0], np.uint8)
        for _ in range(numClasses):
            previousColor = (previousColor + np.random.randint(0, 256, [3], np.uint8)) / 2
            colors.append(previousColor)

    # Boxes are drawn only after every mask is blended, so that captions are
    # not tinted by masks of later detections.
    boxesToDraw = []
    for detId in range(numDetections):
        detection = boxes[0, 0, detId]
        detectionMasks = masks[detId]
        score = detection[2]
        if not score > args.thr:
            continue

        classId = int(detection[1])

        # Scale the box corners by the frame size, then clamp them into the frame.
        left, right = [max(0, min(int(frameW * x), frameW - 1)) for x in (detection[3], detection[5])]
        top, bottom = [max(0, min(int(frameH * y), frameH - 1)) for y in (detection[4], detection[6])]

        boxesToDraw.append([frame, classId, score, left, top, right, bottom])

        # Stretch the mask of the predicted class to the box size and binarize it.
        classMask = cv.resize(detectionMasks[classId], (right - left + 1, bottom - top + 1))
        selected = (classMask > 0.5)

        # `region` is a view into `frame`, so the blend below modifies the frame itself.
        region = frame[top:bottom+1, left:right+1]
        region[selected] = (0.7 * colors[classId] + 0.3 * region[selected]).astype(np.uint8)

    for box in boxesToDraw:
        drawBox(*box)

    # Put efficiency information.
    t, _ = net.getPerfProfile()
    label = 'Inference time: %.2f ms' % (t * 1000.0 / cv.getTickFrequency())
    cv.putText(frame, label, (0, 15), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0))

    showLegend(classes)

    cv.imshow(winName, frame)
Loading

0 comments on commit 472b71e

Please sign in to comment.