In [3]:
#encoding:utf-8
#
#created by xiongzihua
#
import torch
from torch.autograd import Variable
import torch.nn as nn

from net import vgg16
import torchvision.transforms as transforms
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os

In [4]:
# The 20 class names, in the order used to map a predicted class index
# (0..19) to a human-readable label via VOC_CLASSES[cls_index].
VOC_CLASSES = (
    'aeroplane',
    'bicycle',
    'bird',
    'boat',
    'bottle',
    'bus',
    'car',
    'cat',
    'chair',
    'cow',
    'diningtable',
    'dog',
    'horse',
    'motorbike',
    'person',
    'pottedplant',
    'sheep',
    'sofa',
    'train',
    'tvmonitor',
)

In [5]:
def decoder(pred):
    '''
    Turn a raw grid prediction into boxes, class indices and confidences.

    pred (tensor) 1xSxSx30 (S is 7 for the model used in this notebook).
        Per cell, channels 0-3 / 5-8 hold [cx, cy, w, h] of the two candidate
        boxes, channels 4 / 9 hold their confidence and channels 10+ hold the
        class scores. cx, cy are read as offsets inside the cell.

    A candidate box is kept when its confidence is > 0.7; the single
    best-scoring candidate is always kept, whatever its confidence. The kept
    boxes are then filtered with nms().

    return boxes (tensor) [n,4] as [x1,y1,x2,y2] relative to the image,
           cls_indexs (tensor) [n,] class index per box,
           probs (tensor) [n,] confidence per box.
           All three are empty when nothing can be selected
           (e.g. the confidences are NaN).

    The input tensor is left untouched.
    '''
    boxes = []
    cls_indexs = []
    probs = []
    pred = pred.data.squeeze(0)  # SxSx30
    grid_rows, grid_cols = pred.size(0), pred.size(1)
    # width / height of one grid cell relative to the image, ordered as (x, y)
    cell = torch.FloatTensor([1. / grid_cols, 1. / grid_rows])

    contain = torch.cat((pred[:, :, 4].unsqueeze(2), pred[:, :, 9].unsqueeze(2)), 2)
    # above the threshold, or the overall best candidate
    mask = (contain > 0.7) | (contain == contain.max())

    for i in range(grid_rows):
        for j in range(grid_cols):
            for b in range(2):
                if not mask[i, j, b]:
                    continue
                # clone: a plain slice is a view, and converting it in place
                # would silently rewrite the caller's prediction tensor
                box = pred[i, j, b * 5:b * 5 + 4].clone()
                contain_prob = torch.FloatTensor([float(pred[i, j, b * 5 + 4])])
                cell_origin = torch.FloatTensor([j, i]) * cell  # up left of cell
                center = box[:2] * cell + cell_origin  # cxcy relative to image
                half_size = 0.5 * box[2:]
                # convert [cx,cy,w,h] to [x1,y1,x2,y2]
                box_xy = torch.cat((center - half_size, center + half_size), 0)
                _, cls_index = torch.max(pred[i, j, 10:], 0)
                boxes.append(box_xy.view(1, 4))
                # view(1): torch.cat cannot concatenate 0-dim tensors
                cls_indexs.append(cls_index.view(1))
                probs.append(contain_prob)

    if not boxes:
        return torch.zeros(0, 4), torch.LongTensor([]), torch.FloatTensor([])

    boxes = torch.cat(boxes, 0)  # (n,4)
    probs = torch.cat(probs, 0)  # (n,)
    cls_indexs = torch.cat(cls_indexs, 0)  # (n,)
    keep = nms(boxes, probs)
    return boxes[keep], cls_indexs[keep], probs[keep]

In [6]:
def nms(bboxes,scores,threshold=0.5):
    '''
    Greedy non-maximum suppression.

    bboxes(tensor) [N,4] boxes as [x1,y1,x2,y2]
    scores(tensor) [N,]
    threshold(float) a box is dropped when its IoU with an already kept,
        higher-scoring box is greater than this value

    return (LongTensor) indices of the kept boxes, highest score first;
        empty when there are no boxes.
    '''
    if bboxes.numel() == 0:
        return torch.LongTensor([])

    x1 = bboxes[:,0]
    y1 = bboxes[:,1]
    x2 = bboxes[:,2]
    y2 = bboxes[:,3]
    areas = (x2-x1) * (y2-y1)

    _,order = scores.sort(0,descending=True)
    keep = []
    while order.numel() > 0:
        # plain Python int, so it works as an index and in LongTensor(keep)
        i = int(order[0])
        keep.append(i)

        if order.numel() == 1:
            break

        rest = order[1:]
        # intersection of the kept box with every remaining box
        xx1 = x1[rest].clamp(min=float(x1[i]))
        yy1 = y1[rest].clamp(min=float(y1[i]))
        xx2 = x2[rest].clamp(max=float(x2[i]))
        yy2 = y2[rest].clamp(max=float(y2[i]))

        w = (xx2-xx1).clamp(min=0)
        h = (yy2-yy1).clamp(min=0)
        inter = w*h

        ovr = inter / (areas[i] + areas[rest] - inter)
        # view(-1) keeps this 1-D even when exactly one box survives;
        # squeeze() would collapse it to a 0-dim tensor and the next
        # order[0] would fail.
        ids = (ovr<=threshold).nonzero().view(-1)
        if ids.numel() == 0:
            break
        order = rest[ids]
    return torch.LongTensor(keep)

In [7]:
#
#start predict one image
#
def predict_gpu(model,image_name,root_path=''):
    '''
    Run the detector on one image file using the GPU.

    model: network that is called on a 1x3x224x224 CUDA tensor and whose
        output is handed to decoder() (1x7x7x30 for the model in this notebook)
    image_name (str): image file name; it is also copied into every result row
    root_path (str): prefix that is concatenated in front of image_name

    return (list) one entry per detection:
        [(x1,y1), (x2,y2), class_name, image_name, prob]
        with the corners in pixel coordinates of the original image.

    raises IOError when the image cannot be read.
    '''
    image = cv2.imread(root_path+image_name)
    if image is None:
        # cv2.imread signals a missing / unreadable file by returning None
        raise IOError('cannot read image: ' + root_path + image_name)
    h,w,_ = image.shape

    img = cv2.resize(image,(224,224))
    img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
    mean = (123,117,104)#RGB
    img = img - np.array(mean,dtype=np.float32)

    transform = transforms.Compose([transforms.ToTensor(),])
    img = transform(img)

    # Inference only: no_grad() replaces the removed Variable(volatile=True)
    with torch.no_grad():
        pred = model(img[None,:,:,:].cuda()) #1x7x7x30
    pred = pred.cpu()
    boxes,cls_indexs,probs = decoder(pred)

    result = []
    for i,box in enumerate(boxes):
        # boxes are relative to the image; scale back to pixel coordinates
        x1 = int(box[0]*w)
        x2 = int(box[2]*w)
        y1 = int(box[1]*h)
        y2 = int(box[3]*h)
        cls_index = int(cls_indexs[i]) # convert LongTensor to int
        prob = float(probs[i])
        result.append([(x1,y1),(x2,y2),VOC_CLASSES[cls_index],image_name,prob])
    return result
        



if __name__ == '__main__':
    # Build the VGG16 backbone and swap its classifier for the detection head:
    # one hidden fully connected layer, then 1470 outputs (= 7*7*30).
    # The second 4096-wide hidden layer of the stock classifier is left out.
    model = vgg16(pretrained=False)
    model.classifier = nn.Sequential(*[
        nn.Linear(512 * 7 * 7, 4096),
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Linear(4096, 1470),
    ])
    # Restore the trained weights, then switch to inference mode on the GPU.
    model.load_state_dict(torch.load('best.pth'))
    model.eval().cuda()
    

In [10]:
# Demo on a single image: detect objects in Cat.jpg, draw them and save the result.
# Requires `model` and predict_gpu() from the earlier cells.
i = cv2.imread('Cat.jpg')  # image to draw on (BGR, as loaded by OpenCV)
result = predict_gpu(model,'Cat.jpg')
# each row is [(x1,y1), (x2,y2), class_name, image_name, prob]
for left_up,right_bottom,class_name,_,prob in result:
    cv2.rectangle(i,left_up,right_bottom,(0,255,0),2)  # green box
    cv2.putText(i,class_name,left_up,cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),1,2)  # red label at the top-left corner
    print(prob)

# NOTE(review): the file name says NO_NMS, but decoder() as defined in this
# notebook does filter the boxes through nms() - confirm the name is intended.
cv2.imwrite('Cat_Result_NO_NMS.jpg',i)

0.713312923908


True

In [8]:
# Run the detector on a handful of dataset images and save the annotated copies.
root_dir = 'VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/'
img_names = os.listdir(root_dir)
# Only the directory entries at positions 15..31 are processed.
for i,name in enumerate(img_names[:32]):
    if i < 15:
        continue
    image_name = root_dir+name
    print(image_name)
    image = cv2.imread(image_name)
    result = predict_gpu(model,image_name)
    # each row is [(x1,y1), (x2,y2), class_name, image_name, prob]
    for left_up,right_bottom,class_name,_,prob in result:
        cv2.rectangle(image,left_up,right_bottom,(0,255,0),2)
        cv2.putText(image,class_name,left_up,cv2.FONT_HERSHEY_SIMPLEX,1,(0,0,255),1,2)
        print(prob)

    cv2.imwrite('train_result'+str(i)+'.jpg',image)

VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2008_008683.jpg
0.987594485283
0.947912335396
0.941967189312
0.883013606071
0.87505710125
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2010_002763.jpg
0.871656537056
0.765147686005
0.753469884396
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2011_003534.jpg
0.887986421585
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2011_001600.jpg
0.963555455208
0.832829594612
0.822455108166
0.753276467323
0.70083373785
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2010_005309.jpg
0.952035307884
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2008_007390.jpg
0.766653835773
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2008_002369.jpg
0.991786420345
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2008_004522.jpg
0.89200937748
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2008_004950.jpg
0.93240404129
VOC_Datasets/VOC_2012_training/VOC2012/JPEGImages/2008_008545.jpg
0.97771191597
VOC_Datasets/VOC_2012_training/VOC2012/JPEGIm

In [25]:
# Rebuild the detector: VGG16 backbone with a classifier head made of one
# hidden fully connected layer followed by 1470 outputs (= 7*7*30).
model = vgg16(pretrained=False)
model.classifier = nn.Sequential(*[
    nn.Linear(512 * 7 * 7, 4096),
    nn.ReLU(inplace=True),
    nn.Dropout(),
    nn.Linear(4096, 1470),
])
# Restore the trained weights and switch to inference mode.
model.load_state_dict(torch.load('best.pth'))
model.eval()
# Kept as the last expression so the cell still displays the model layout.
model.cuda()
    

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), dilation=(1, 1), ceil_mode=False)
    