## YOLOv1 구현

In [1]:
import numpy as np
import cv2
from functools import partial
import xmltodict
from tqdm import tqdm
import tensorflow as tf
import random
from glob import glob

In [2]:
# Build the VGG16 backbone exactly once and reuse it below. Instantiating it
# constructs the whole network and loads the ImageNet weights, so doing that
# again for every layer index is pure overhead.
vgg16_backbone = tf.keras.applications.VGG16(weights='imagenet', include_top=False,  input_shape=(224, 224, 3))
max_num = len(vgg16_backbone.layers) # total number of layers in the backbone

YOLO = tf.keras.models.Sequential(name = "YOLO")
# Copy every backbone layer except the last one (indices 0 .. max_num-2).
for backbone_layer in vgg16_backbone.layers[:max_num-1]:
    YOLO.add(backbone_layer)

initializer = tf.keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None)
regularizer = tf.keras.regularizers.l2(0.0005)

for layer in YOLO.layers:
    # Attach the L2 regularizer to every layer that exposes a 'kernel_regularizer' attribute.
    # NOTE(review): these layers are already built at this point; Keras may only
    # register regularization losses when the weights are created, so this
    # assignment might not contribute to the training loss -- confirm by
    # inspecting YOLO.losses.
    if hasattr(layer, 'kernel_regularizer'):
        setattr(layer, 'kernel_regularizer', regularizer)

### The original paper uses DarkNet; VGG16 is used here to keep the implementation simple.
### Extra layers are stacked on top of it below to implement YOLO.

In [3]:
# Detection head stacked on top of the VGG16 feature extractor.
# The layers are created in the same order as they are added, so automatically
# generated layer names are unaffected.
# NOTE(review): the convolutions and the first Dense layer have no activation,
# while the final Dense layer uses leaky ReLU (alpha=0.01). The YOLOv1 paper
# describes the opposite arrangement (leaky ReLU everywhere except a linear
# final layer) -- confirm this is intentional.
detection_conv_options = dict(kernel_initializer=initializer, padding='SAME', kernel_regularizer=regularizer, dtype='float32')

detection_head = [
    tf.keras.layers.Conv2D(1024, (3, 3), name="detection_conv1", **detection_conv_options),
    tf.keras.layers.Conv2D(1024, (3, 3), name="detection_conv2", **detection_conv_options),
    tf.keras.layers.MaxPool2D((2, 2)),
    tf.keras.layers.Conv2D(1024, (3, 3), name="detection_conv3", **detection_conv_options),
    tf.keras.layers.Conv2D(1024, (3, 3), name="detection_conv4", **detection_conv_options),
    # Fully connected part
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4096, activation=None, kernel_initializer=initializer, name="detection_linear1", dtype='float32'),
    tf.keras.layers.Dropout(.5),
    # 7*7*30 = 1470 outputs. Per the original note: values 0~29 belong to the
    # cell at (0, 0), values 30~59 to the cell at (1, 0), and so on.
    tf.keras.layers.Dense(1470, activation=partial(tf.nn.leaky_relu, alpha=0.01), kernel_initializer=initializer, name="detection_linear2", dtype='float32'),
    tf.keras.layers.Reshape((7, 7, 30), name="output", dtype='float32'),
]

for head_layer in detection_head:
    YOLO.add(head_layer)

In [4]:
# Print the layer-by-layer summary (layer type, output shape, parameter count) of the assembled model.
YOLO.summary()

Model: "YOLO"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 56, 56, 256)       295168 

In [None]:
def yolo_multitask_loss(y_true, y_pred): # custom loss function
    """Compute the YOLOv1 multi-task loss for a batch.

    The loss is the sum of three parts: localization (box geometry),
    confidence, and classification. In every cell that contains an object,
    the predicted box with the higher IoU against the ground-truth box is
    treated as the "responsible" box.

    Args:
        y_true: Labels indexed as [..., :5] = ground-truth box
            [x, y, w, h, confidence] and [..., 5:] = class targets.
            Assumes shape [batch, 7, 7, 25], a confidence of 1 for cells that
            contain an object and 0 otherwise, x/y given as offsets inside the
            grid cell and w/h normalized by the image size -- this follows the
            original author's notes; TODO confirm against the label generator.
        y_pred: Network output indexed as [..., :5] = first box,
            [..., 5:10] = second box, [..., 10:] = class logits
            (shape [batch, 7, 7, 30] for the model in this notebook).

    Returns:
        Scalar tensor: per-cell losses summed over the grid, then averaged
        over the batch.
    """
    # Loss weights taken from the YOLOv1 paper.
    lambda_coord = 5.0
    lambda_noobj = 0.5

    # Convert the labels to a tensor of the same dtype as the predictions.
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(tf.convert_to_tensor(y_true), y_pred.dtype)

    # box = [x, y, w, h, confidence_score]
    pred_box_batch_1 = y_pred[..., :5] # [-1,7,7,5]
    pred_box_batch_2 = y_pred[..., 5:10] # [-1,7,7,5]
    pred_class_batch = y_pred[..., 10:] # [-1,7,7,20]

    # y_true holds the ground-truth box (0~4) followed by the classes (5~24).
    true_box_batch = y_true[..., :5] # [-1,7,7,5]
    true_class_batch = y_true[..., 5:] # [-1,7,7,20]

    # Used as a multiplicative mask: 1 where a cell contains an object.
    object_mask = true_box_batch[..., 4] # [-1,7,7]

    # The IoU only selects the responsible box and serves as the confidence
    # target, so no gradient should flow through it.
    iou_1 = tf.stop_gradient(_yolo_cell_iou(pred_box_batch_1, true_box_batch))
    iou_2 = tf.stop_gradient(_yolo_cell_iou(pred_box_batch_2, true_box_batch))

    first_is_better = tf.cast(iou_1 >= iou_2, y_pred.dtype)
    responsible_1 = object_mask * first_is_better
    responsible_2 = object_mask * (1.0 - first_is_better)

    # Localization error: only the responsible box contributes.
    localization_loss = (
        responsible_1 * _yolo_coord_error(pred_box_batch_1, true_box_batch)
        + responsible_2 * _yolo_coord_error(pred_box_batch_2, true_box_batch)
    )

    # Confidence error: the target is 1 * IoU for the responsible box and 0
    # for every other box.
    confidence_1 = pred_box_batch_1[..., 4]
    confidence_2 = pred_box_batch_2[..., 4]
    object_confidence_loss = (
        responsible_1 * tf.square(confidence_1 - iou_1)
        + responsible_2 * tf.square(confidence_2 - iou_2)
    )
    noobject_confidence_loss = (
        (1.0 - responsible_1) * tf.square(confidence_1)
        + (1.0 - responsible_2) * tf.square(confidence_2)
    )

    # Classification error: softmax cross-entropy, counted only for cells
    # that contain an object.
    classification_loss = object_mask * tf.nn.softmax_cross_entropy_with_logits(
        labels=true_class_batch, logits=pred_class_batch
    )

    per_cell_loss = (
        lambda_coord * localization_loss
        + object_confidence_loss
        + lambda_noobj * noobject_confidence_loss
        + classification_loss
    )
    return tf.reduce_mean(tf.reduce_sum(per_cell_loss, axis=[1, 2]))


def _yolo_cell_iou(box_a, box_b, grid_size=7.0):
    """Return the IoU of two [..., (x, y, w, h, ...)] boxes that share a grid cell."""
    # Centers are divided by the grid size so they use the same unit as the
    # image-normalized sizes. Both boxes belong to the same cell, so the
    # cell origin cancels out and can be ignored.
    center_a = box_a[..., 0:2] / grid_size
    center_b = box_b[..., 0:2] / grid_size
    # A predicted size can be negative; treat such a box as having zero extent.
    size_a = tf.maximum(box_a[..., 2:4], 0.0)
    size_b = tf.maximum(box_b[..., 2:4], 0.0)

    top_left = tf.maximum(center_a - size_a / 2.0, center_b - size_b / 2.0)
    bottom_right = tf.minimum(center_a + size_a / 2.0, center_b + size_b / 2.0)
    overlap = tf.maximum(bottom_right - top_left, 0.0)

    intersection = overlap[..., 0] * overlap[..., 1]
    union = size_a[..., 0] * size_a[..., 1] + size_b[..., 0] * size_b[..., 1] - intersection
    # divide_no_nan yields 0 where the union is 0 (e.g. cells without an object).
    return tf.math.divide_no_nan(intersection, union)


def _yolo_coord_error(pred_box, true_box, eps=1e-6):
    """Return the squared error of (x, y) plus the squared error of the square-rooted (w, h)."""
    center_error = tf.reduce_sum(tf.square(pred_box[..., 0:2] - true_box[..., 0:2]), axis=-1)

    def signed_sqrt(values):
        # Sign-preserving square root: stays defined for negative predictions,
        # and eps keeps the gradient finite at zero.
        return tf.sign(values) * tf.sqrt(tf.abs(values) + eps)

    size_error = tf.reduce_sum(
        tf.square(signed_sqrt(pred_box[..., 2:4]) - signed_sqrt(true_box[..., 2:4])), axis=-1
    )
    return center_error + size_error
    

# 훈련 계획
## epoch : 135
## batch size : 64
## momentum : 0.9
## weight decay : 0.0005
## learning rate : 0.01(1~75), 0.001(76~105), 0.0001(106~135)

## data augmentation : random scaling, translation을 원래 이미지 사이즈의 20%까지만 수행

In [None]:
# Dataset locations
train_x_path = '/home/ubuntu/CUAI_2021/Advanced_Minkyu_Kim/PASCAL_VOC_2007/train/VOCdevkit/VOC2007/JPEGImages'
train_y_path = '/home/ubuntu/CUAI_2021/Advanced_Minkyu_Kim/PASCAL_VOC_2007/train/VOCdevkit/VOC2007/Annotations'

# Collect the sorted paths found in each directory. Each directory is scanned
# only once; glob() already returns a list, so no comprehension is needed.
# NOTE: without recursive=True the '**' pattern behaves like '*'.
list_train_x = sorted(glob(train_x_path + '/**'))
list_train_y = sorted(glob(train_y_path + '/**'))

# The same listings under the names used by the later cells. They are copies,
# so modifying one list does not affect the other.
image_file_list = list(list_train_x)
xml_file_list = list(list_train_y)

In [None]:
# Find out which ground-truth boxes an image has (label acquisition)
def get_Ground_Truth_Box_fromImage(xml_file_path): # xml_file_path is the path of a single file
    """Read the bounding boxes from one annotation XML file.

    Every box is rescaled from the image size recorded in the annotation to a
    224x224 image.

    Args:
        xml_file_path: Path of a single annotation XML file.

    Returns:
        numpy array of shape (num_objects, 4) whose rows are
        [x_min, y_min, x_max, y_max] (top-left and bottom-right corners) in
        224x224 pixel coordinates. The shape is (0, 4) when the annotation
        contains no object.
    """
    target_size = 224

    # The context manager closes the file even if parsing fails.
    with open(xml_file_path) as f:
        xml_file = xmltodict.parse(f.read())

    annotation = xml_file['annotation']
    Image_Height = float(annotation['size']['height'])
    Image_Width  = float(annotation['size']['width'])

    x_scale = target_size / Image_Width
    y_scale = target_size / Image_Height

    # xmltodict yields a list when there are several <object> elements and a
    # single mapping when there is exactly one; normalize both cases to a list.
    objects = annotation.get('object') or []
    if not isinstance(objects, list):
        objects = [objects]

    # Box corners (top-left, bottom-right), rescaled to 224x224.
    Ground_Truth_Box_list = [
        [
            x_scale * float(obj['bndbox']['xmin']),
            y_scale * float(obj['bndbox']['ymin']),
            x_scale * float(obj['bndbox']['xmax']),
            y_scale * float(obj['bndbox']['ymax']),
        ]
        for obj in objects
    ]

    # The explicit row count keeps the (N, 4) shape even when N == 0.
    return np.asarray(Ground_Truth_Box_list, dtype=np.float64).reshape(len(objects), 4)


In [None]:
# Build the input images.
# First the images are loaded; data augmentation is performed afterwards.
def make_input(image_file_list):
    """Load every image in image_file_list with OpenCV and stack the results.

    Args:
        image_file_list: Sequence of image file paths.

    Returns:
        The loaded images wrapped in a numpy array, in input order.
    """
    # NOTE(review): the cv2.imread results are not checked and the images are
    # not resized here -- verify that every path is readable and that the
    # images share one shape before relying on the stacked array.
    loaded_images = [
        cv2.imread(image_path)
        for image_path in tqdm(image_file_list, desc="get image")
    ]
    return np.asarray(loaded_images)

## Data Augmentation
출처 : https://towardsdatascience.com/complete-image-augmentation-in-opencv-31a6b02694f5

In [None]:
def fill(img, h, w):
    """Resize img to h rows by w columns using bicubic interpolation.

    Args:
        img: Image array accepted by cv2.resize.
        h: Target height in pixels.
        w: Target width in pixels.

    Returns:
        The resized image.
    """
    # cv2.resize expects dsize as (width, height). The interpolation flag must
    # be passed by keyword because the third positional parameter is `dst`.
    img = cv2.resize(img, (w, h), interpolation=cv2.INTER_CUBIC)
    return img

# Setting ratio to 0.2 should do (the paper says "up to 20% of the original image size")
def horizontal_shift(img, ratio=0.0):
    """Crop a random strip from the left or right edge, then hand the crop to fill().

    Args:
        img: Image array indexed as [row, column, channel].
        ratio: Upper bound, within [0, 1], of the fraction of the width that
            may be cropped away.

    Returns:
        fill(cropped, h, w) with the original height and width, or the
        untouched input when ratio is outside [0, 1].
    """
    if ratio < 0 or ratio > 1:
        print('Value should be less than 1 and greater than 0')
        return img

    drawn_ratio = random.uniform(-ratio, ratio)
    height, width = img.shape[:2]
    shift_px = width * drawn_ratio

    if drawn_ratio > 0:
        # Positive draw: drop columns from the right edge.
        cropped = img[:, :int(width - shift_px), :]
    elif drawn_ratio < 0:
        # Negative draw: drop columns from the left edge.
        cropped = img[:, int(-1 * shift_px):, :]
    else:
        cropped = img

    return fill(cropped, height, width)

def vertical_shift(img, ratio=0.0):
    """Crop a random strip from the top or bottom edge, then hand the crop to fill().

    Args:
        img: Image array indexed as [row, column, channel].
        ratio: Upper bound, within [0, 1], of the fraction of the height that
            may be cropped away.

    Returns:
        fill(cropped, h, w) with the original height and width, or the
        untouched input when ratio is outside [0, 1].
    """
    if ratio < 0 or ratio > 1:
        print('Value should be less than 1 and greater than 0')
        return img

    drawn_ratio = random.uniform(-ratio, ratio)
    height, width = img.shape[:2]
    shift_px = height * drawn_ratio

    if drawn_ratio > 0:
        # Positive draw: drop rows from the bottom edge.
        cropped = img[:int(height - shift_px), :, :]
    elif drawn_ratio < 0:
        # Negative draw: drop rows from the top edge.
        cropped = img[int(-1 * shift_px):, :, :]
    else:
        cropped = img

    return fill(cropped, height, width)

def zoom(img, value): # keeps only a `value` (0~1) share of the full image, so for 20% pass 0.8?
    """Cut a random window out of img and hand it to fill().

    Args:
        img: Image array indexed as [row, column, channel].
        value: Lower bound, within [0, 1], of the fraction of each side that
            is kept; the actual fraction is drawn uniformly from [value, 1].

    Returns:
        fill(window, h, w) with the original height and width, or the
        untouched input when value is outside [0, 1].
    """
    if value < 0 or value > 1:
        print('Value for zoom should be less than 1 and greater than 0')
        return img

    kept_fraction = random.uniform(value, 1)
    height, width = img.shape[:2]
    window_h = int(kept_fraction * height)
    window_w = int(kept_fraction * width)

    # Draw the top-left corner so that the window stays inside the image.
    top = random.randint(0, height - window_h)
    left = random.randint(0, width - window_w)

    window = img[top:top + window_h, left:left + window_w, :]
    return fill(window, height, width)


In [None]:
# Brightness adjustment (The more the value of Saturation and Value matrices the greater is the brightness)

def brightness(img, low, high): # low = 0.5, high = 1.5 would probably be a good setting
    """Scale the saturation and value channels of a BGR image by one random factor.

    Args:
        img: BGR image accepted by cv2.cvtColor.
        low: Lower bound of the random scale factor.
        high: Upper bound of the random scale factor.

    Returns:
        The adjusted image, converted back to BGR.
    """
    # A single factor drawn from [low, high] multiplies both channels.
    factor = random.uniform(low, high)

    # Work in float so the multiplication cannot wrap around in uint8.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float64)

    # Scale channels 1 (saturation) and 2 (value), capping the result at 255.
    hsv[:, :, 1:3] = np.minimum(hsv[:, :, 1:3] * factor, 255)

    return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)