# Prepare the codebase



In [None]:
# Download repo
!git clone https://github.com/lpraat/autoagent
%cd autoagent

In [None]:
# Set pythonpath
# Sets the PYTHONPATH environment variable to the current directory.
# NOTE(review): presumably this lets the scripts launched with `!python` below
# import the `autoagent` package; the value is relative ("."), so it relies on
# the working directory staying at the repository root — confirm.
%env PYTHONPATH=.

In [None]:
# Install required libraries
!pip install -r requirements.txt
# Also install gdown to ease google drive downloads
!pip install gdown

# Inference

*NOTE*: By default, this repo uses the same model architecture as the [YOLOv5 repository](https://github.com/ultralytics/yolov5). You can find its definition in the `/autoagent/models/vision/yolo/model_v5.py` file.

### Params
We start by creating a yaml file, which is a config file containing all the parameters defining a model and its usage. An example config file can be found in /autoagent/models/vision/yolo/config/params.yaml.


Below, you can find a hopefully exhaustive description of each parameter. For now we focus on those that are relevant for inference; we will look at the remaining ones in the Training section below.

In [4]:
# Inference configuration, assembled one section at a time.
# Keys are inserted in the same order as a single dict literal would give.
params = {}

# ---- Model structure ----
params.update({
  # Architecture variant: v5 small, medium, or large.
  # This example loads pre-trained weights for v5-large.
  'version': 'v5-large',
  # Activation function (leaky, hswish, or add your preferred one).
  # hswish is selected because the original v5 weights work with it.
  'act': 'hswish',
  # Convolution kind; leave as-is since only the conv type is defined.
  'conv': 'conv',
  # Whether to initialize biases; irrelevant for inference.
  'init_biases': True,
})

# ---- Optimization ----
# None of these matter for inference; leave them untouched.
params.update({
  # Reduction applied to obtain the final loss (either sum or mean)
  'reduction': 'mean',
  # The final loss is a weighted sum of localization, detection, and class loss
  'loc_w': 0.05,
  'det_w': 1,
  'cls_w': 0.125,
  # Influence of the ciou loss on the detection loss (from 0 to 1)
  'ciou_ratio': 1,
  # Optimizer
  'optim': 'sgd',
  # Number of warmup epochs
  'warmup': 3,
  # Total number of epochs
  'num_epochs': 300,
  # Initial learning rate: lr warms up from 0 to 0.01 during the warmup
  # epochs, then a cosine scheduler anneals it to final_lr
  'init_lr': 0.01,
  # Final learning rate
  'final_lr': 0.002,
  # Momentum warms up from 0.8 to 0.937 during the warmup epochs
  'warmup_momentum': 0.8,
  'momentum': 0.937,
  # Regularizer
  'weight_decay': 0.0005,
  # Balance coefficients for the detection loss
  # (large, medium, small output grids)
  'balance': [0.4, 1, 4],
  # Exponential moving average parameters
  # (https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage)
  'ema_decay': 0.9999,
  'ema_exp_d': 2000,
  'ema_mode': 'exp',
})

# ---- Eval ----
# Evaluation parameters are irrelevant for inference; leave them untouched.
params.update({
  # Minimum acceptance confidence threshold
  'confidence_thresh': 0.001,
  # Non-maximum suppression threshold
  'nms_thresh': 0.4,
})

# ---- Img ----
# Augmentation parameters are irrelevant for inference; leave them untouched.
params.update({
  # Sequence of augmentations to eventually apply to each image at training time
  'augments': ['delta_bright', 'hflip'],
  # Mosaic augmentation: probability to build a mosaic instead of a single image
  'mosaic_prob': 1,
  # Parameters of the affine transform used to build the mosaic.
  # NOTE(review): the original note pointed at autoagents/models/yolo/data_wrapper;
  # presumably the module lives under autoagent/models/vision/yolo/ — confirm.
  'mosaic_delta': 0.1,
  'mosaic_scale': 0.5,
  'mosaic_translate': 0.1,
})

# ---- Target ----
# How a prediction grid is transformed to retrieve the bbox predictions.
params.update({
  # Anchor priors, as defined in v5
  'anchor_priors': [
    [[10, 13], [16, 30], [33, 23]],
    [[30, 61], [62, 45], [59, 119]],
    [[116, 90], [156, 198], [373, 326]],
  ],
  # Step size at each "grid division". The v5 network downsamples the input
  # image by a factor of 32 for the first grid (13x13 for a 416x416 input),
  # by a factor of 16 for the second grid (26x26), etc.
  'steps': [32, 16, 8],
  # mode and mult_thresh are irrelevant for inference; leave them untouched.
  # During training they decide whether a target bounding box is mapped to a
  # given anchor prior (multi_anchors introduced in v4). Two modes are supported:
  # 1) mult mode, the one used in yolov5, which assigns a bbox to an anchor prior
  #    if bbox_width <= anchor_prior_width * mult_thresh and
  #       bbox_height <= anchor_prior_height * mult_thresh
  # 2) iou mode, which assigns a bbox to an anchor prior
  #    if iou(bbox, anchor) >= iou_thresh
  #    (that alternative would read  mode: 'iou',  iou_thresh: [0.2, 0.2, 0.2])
  'mode': 'mult',
  'mult_thresh': [4, 4, 4],
  # Activation used to obtain the final bboxes from anchor_priors, sigmoid or exp:
  # 1) exp as in v4, bbox_wh = exp(pred_wh) * anchor_prior_wh
  # 2) sigmoid as in v5, bbox_wh = (2*sigmoid(pred_wh))**2 * anchor_prior_wh
  # With sigmoid the output is at most 4 times anchor_prior_wh, which is why
  # mult_thresh above is set to 4.
  'bbox_fn': 'sigmoid',
})

# ---- COCO names ----
# The pretrained weights come from the yolov5 repo and were trained on COCO.
params['cls_names'] = [
  'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
  'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
  'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
  'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
  'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
  'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
  'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
  'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
  'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
  'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
  'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
  'hair drier', 'toothbrush',
]

# Serialize the parameters to ./inference_params.yaml; this file is handed to
# the inference script through its --params option further below
import yaml
with open('./inference_params.yaml', mode='w') as params_file:
  yaml.dump(params, stream=params_file)

## Try Inference on an Image

In [None]:
# Download pretrained Yolo-v5 COCO weights
!gdown https://drive.google.com/uc?id=1wo7ftxMRxUhFNml7e2_Zl9rGqutQEJ6x
!unzip yolov5weights.zip

In [None]:
# View sample usage
# Invokes run.py (the script used for inference in the next cell) with --help
!python ./autoagent/models/vision/yolo/run.py --help

In [7]:
# Run inference at 720p on a sample image
# NOTE(review): "720p" presumably means --img_dim 1280 is the longer image side
# (1280x720) — confirm against `run.py --help`.
# --params is the yaml file dumped earlier in this notebook; --ckpt points at the
# v5l.pt file inside the unzipped ./yolov5weights folder.
# --conf_thresh / --nms_thresh are passed explicitly (0.25 / 0.45).
# NOTE(review): --half_precision --cuda presumably require a GPU runtime — confirm.
# The output path is given by --save_path (./result.jpg, read by the next cell).
!python ./autoagent/models/vision/yolo/run.py \
    --params ./inference_params.yaml \
    --ckpt ./yolov5weights/v5l.pt \
    --source ./autoagent/models/vision/yolo/sample_data/pic.png \
    --img_dim 1280 \
    --conf_thresh 0.25 \
    --nms_thresh 0.45 \
    --half_precision --cuda \
    --save_path ./result.jpg

In [None]:
# Visualize the results
import cv2
from google.colab.patches import cv2_imshow
result = cv2.imread('./result.jpg')
# cv2.imread returns None (it does not raise) when the file is missing or
# unreadable, so fail here with a clear message instead of inside cv2_imshow
if result is None:
  raise FileNotFoundError("Could not read './result.jpg'; run the inference cell first.")
cv2_imshow(result)

# Training
We will train the model on the [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) dataset (2007trainval + 2012trainval for training, 2007 for validation), which is much smaller than the [COCO](https://cocodataset.org/#home) dataset and more suitable to show a training sample run.

In [9]:
# Training configuration for VOC, assembled one section at a time.
# Keys are inserted in the same order as a single dict literal would give.
params = {}

# ---- Model structure ----
params.update({
  # Architecture variant: v5 small, medium, or large
  'version': 'v5-large',
  # Activation function (leaky, hswish, or add your preferred one)
  'act': 'hswish',
  # Convolution kind; leave as-is since only the conv type is defined
  'conv': 'conv',
  # Whether to initialize biases or not
  'init_biases': True,
})

# ---- Optimization ----
params.update({
  # Reduction applied to obtain the final loss (either sum or mean)
  'reduction': 'mean',
  # The final loss is a weighted sum of localization, detection, and class loss
  'loc_w': 0.05,
  'det_w': 1,
  'cls_w': 0.125,
  # Influence of the ciou loss on the detection loss (from 0 to 1)
  'ciou_ratio': 1,
  # Optimizer
  'optim': 'sgd',
  # Number of warmup epochs
  'warmup': 3,
  # Total number of epochs
  'num_epochs': 300,
  # Initial learning rate: lr warms up from 0 to 0.01 during the warmup
  # epochs, then a cosine scheduler anneals it to final_lr
  'init_lr': 0.01,
  # Final learning rate
  'final_lr': 0.002,
  # Momentum warms up from 0.8 to 0.937 during the warmup epochs
  'warmup_momentum': 0.8,
  'momentum': 0.937,
  # Regularizer
  'weight_decay': 0.0005,
  # Balance coefficients for the detection loss
  # (large, medium, small output grids)
  'balance': [0.4, 1, 4],
  # Exponential moving average parameters
  # (https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage)
  'ema_decay': 0.9999,
  'ema_exp_d': 2000,
  # NOTE(review): the inference and fine-tuning configs in this notebook use
  # 'exp' here — confirm that 'sigmoid' is the intended mode for this run.
  'ema_mode': 'sigmoid',
})

# ---- Eval ----
params.update({
  # Minimum acceptance confidence threshold
  'confidence_thresh': 0.001,
  # Non-maximum suppression threshold
  'nms_thresh': 0.4,
})

# ---- Img ----
params.update({
  # Sequence of augmentations to eventually apply to each image at training time
  'augments': ['delta_bright', 'hflip'],
  # Mosaic augmentation: probability to build a mosaic instead of a single image
  'mosaic_prob': 1,
  # Parameters of the affine transform used to build the mosaic.
  # NOTE(review): the original note pointed at autoagents/models/yolo/data_wrapper;
  # presumably the module lives under autoagent/models/vision/yolo/ — confirm.
  'mosaic_delta': 0.1,
  'mosaic_scale': 0.5,
  'mosaic_translate': 0.1,
})

# ---- Target ----
# How a prediction grid is transformed to retrieve the bbox predictions.
params.update({
  # Anchor priors, as defined in v5
  'anchor_priors': [
    [[10, 13], [16, 30], [33, 23]],
    [[30, 61], [62, 45], [59, 119]],
    [[116, 90], [156, 198], [373, 326]],
  ],
  # Step size at each "grid division". The v5 network downsamples the input
  # image by a factor of 32 for the first grid (13x13 for a 416x416 input),
  # by a factor of 16 for the second grid (26x26), etc.
  'steps': [32, 16, 8],
  # mode and mult_thresh are used in the training phase to decide whether a
  # target bounding box is mapped to a given anchor prior (multi_anchors
  # introduced in v4). Two modes are supported:
  # 1) mult mode, the one used in yolov5, which assigns a bbox to an anchor prior
  #    if bbox_width <= anchor_prior_width * mult_thresh and
  #       bbox_height <= anchor_prior_height * mult_thresh
  # 2) iou mode, which assigns a bbox to an anchor prior
  #    if iou(bbox, anchor) >= iou_thresh
  #    (that alternative would read  mode: 'iou',  iou_thresh: [0.2, 0.2, 0.2])
  'mode': 'mult',
  'mult_thresh': [4, 4, 4],
  # Activation used to obtain the final bboxes from anchor_priors, sigmoid or exp:
  # 1) exp as in v4, bbox_wh = exp(pred_wh) * anchor_prior_wh
  # 2) sigmoid as in v5, bbox_wh = (2*sigmoid(pred_wh))**2 * anchor_prior_wh
  # With sigmoid the output is at most 4 times anchor_prior_wh, which is why
  # mult_thresh above is set to 4.
  'bbox_fn': 'sigmoid',
})

# ---- VOC names ----
params['cls_names'] = [
  'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
  'bus', 'car', 'cat', 'chair', 'cow',
  'diningtable', 'dog', 'horse', 'motorbike', 'person',
  'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]

# Serialize the parameters to ./training_params.yaml; this file is handed to
# the training script through its --params option further below
import yaml
with open('./training_params.yaml', mode='w') as params_file:
  yaml.dump(params, stream=params_file)

## VOC training

In [None]:
# Download the dataset
!gdown https://drive.google.com/uc?id=1sTZHfpD6y37TBkWKFE4lz1EIaJGqCIty

In [None]:
# Place the data under /data folder
!mkdir ./data
!unrar x voc.rar ./data

In [None]:
# View sample usage
# Invokes train.py (the script used for training in the next cell) with --help
!python ./autoagent/models/vision/yolo/train.py --help

In [None]:
# Start training
# NOTE: The average precision is not computed (and set to 0) during warmup epochs (3)
# --params is the yaml file dumped from the training configuration above.
# NOTE(review): --aggregate presumably is the number of batches over which
# gradients are accumulated — confirm with `train.py --help`.
!python ./autoagent/models/vision/yolo/train.py \
  --params ./training_params.yaml \
  --data voc --batch_size 64 --aggregate 1 \
  --img_dim 416 --num_workers 4

# Fine-tuning
Let's fine-tune a pre-trained v5-large model on VOC.

In [7]:
 params = {
  # Model Structure
  # The model architecture, in this case v5 (small, medium, or large)
  'version': 'v5-large',
  # Which activation to use
  'act': 'hswish',  # leaky, hswish, or add your preferred one
  # Kind of convolution, leave it as-is as only conv type is defined
  'conv': 'conv',
  # Whether to initialize biases or not.
  'init_biases': True,

  # Optimization
  # Reduction function used to compute final loss (either sum or mean)
  'reduction': 'mean',
  # Final loss is a weighted sum of localization, detection, and class loss
  'loc_w': 0.05,
  'det_w': 1,
  'cls_w': 0.125,
  # How much ciou loss influences detection loss (from 0 to 1)
  'ciou_ratio': 1,
  # Optimizer
  'optim': 'sgd',
  # Warmup epochs
  'warmup': 1,
  # Total number of epochs
  'num_epochs': 100,
  # Initial learning rate
  # Lr warmups from 0 to 0.01 during the first warmup epochs
  # then it is annealed using a cosine scheduler to final_lr
  'init_lr': 0.001,
  # Final learning rate
  'final_lr': 0.0002,
  # Momentum warmups from 0.8 to 0.937 during the first warmup epochs
  'warmup_momentum': 0.8,
  'momentum': 0.9,
  # Regularizer
  'weight_decay': 0.0005,
  # Balance coefficients for detection loss
  'balance': [0.4, 1, 4],  # Large , medium, small output grids
  # Exponential moving average parameters 
  # (https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage)
  'ema_decay': 0.9999,
  'ema_exp_d': 2000,
  'ema_mode': 'exp',

  # Eval
  # Minimum acceptance confidence threshold
  'confidence_thresh': 0.001,
  # Non-maximum suppression threshold
  'nms_thresh': 0.4,

  # Img
  # Sequence of augmentations to eventually apply to each image at training time
  'augments': ['delta_bright', 'hflip'],
  # Mosaic augmentation
  # Probability to build a mosaic instead of a single image
  'mosaic_prob': 1,
  # Parameters influencing the affine transform used to build the mosaic
  # See autoagents/models/yolo/data_wrapper
  'mosaic_delta': 0.1,
  'mosaic_scale': 0.5,
  'mosaic_translate': 0.1,

  # Target
  # These parameters define how a prediction grid is transformed
  # to retrieve the bbox predictions
  # Anchor priors, as defined in v5
  'anchor_priors': [
    [[10, 13], [16, 30], [33, 23]],
    [[30, 61], [62, 45], [59, 119]],
    [[116, 90], [156, 198], [373, 326]]
  ],
  # Downsampled size at each prediction grid
  # Step size at each "grid division". The v5 network structure downsamples
  # the input image by a factor of 32, which is the output of the first grid (13x13 in case of a 416x416 input)
  # the input image by a factor of 16, which is the output of the second grid (26x26) etc...
  'steps': [32, 16, 8],
  'mode': 'mult',
  'mult_thresh': [4, 4, 4],
  'bbox_fn': 'exp',

  # VOC names
  'cls_names': [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat',
    'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
  ]
}

# Serialize the parameters to ./fine_tune_params.yaml; this file is handed to
# the training script through its --params option in the next cell
import yaml
with open('./fine_tune_params.yaml', mode='w') as params_file:
  yaml.dump(params, stream=params_file)

In [None]:
# Fine-tune training for 10 epochs
!python ./autoagent/models/yolo/train.py \
  --params ./fine_tune_params.yaml \
  --data voc --batch_size 64 --aggregate 1 \
  --ckpt ./yolov5weights/v5l.pt --fine_tune \
  --img_dim 416 --num_workers 4

# View training statistics
We can view the training statistics using TensorBoard.

In [None]:
# Load the TensorBoard notebook extension and open it on the ./exp/ log directory
# NOTE(review): presumably train.py writes its logs under ./exp/ — confirm.
%load_ext tensorboard
%tensorboard --logdir ./exp/