In [1]:
from ultralytics import YOLO
import torch

In [3]:
# make a subset of the dataset with 100 images and their labels
import os
import random
import shutil

def create_subset(input_images_dir, input_labels_dir, output_images_dir, output_labels_dir, subset_size, seed=None):
    """Copy a random subset of image/label pairs into separate output directories.

    Only regular files in ``input_images_dir`` that have a matching label file
    (same base name, ``.txt`` extension) in ``input_labels_dir`` are eligible.
    Sub-directories, stray files and images without a label are ignored, so
    every copied image is guaranteed to have its label copied as well.

    Args:
        input_images_dir: Directory containing the source images.
        input_labels_dir: Directory containing the source ``.txt`` label files.
        output_images_dir: Destination directory for the selected images
            (created if missing).
        output_labels_dir: Destination directory for the selected labels
            (created if missing).
        subset_size: Number of image/label pairs to copy.
        seed: Optional seed for a private random generator. When ``None``
            (the default) the module-level ``random`` state is used, so a
            prior ``random.seed(...)`` call still controls the selection.

    Raises:
        ValueError: If ``subset_size`` is negative or larger than the number
            of eligible image/label pairs. Raised before anything is written.
    """
    # Sort the listing so a seeded sample does not depend on the arbitrary
    # order in which the filesystem returns directory entries.
    candidates = []
    for image in sorted(os.listdir(input_images_dir)):
        if not os.path.isfile(os.path.join(input_images_dir, image)):
            continue  # skip sub-directories and other non-file entries
        # Label filename is the image filename with a '.txt' extension
        label_file = os.path.splitext(image)[0] + '.txt'
        if os.path.isfile(os.path.join(input_labels_dir, label_file)):
            candidates.append((image, label_file))

    # Validate up front so a bad request never leaves a half-copied subset.
    if not 0 <= subset_size <= len(candidates):
        raise ValueError(
            f"subset_size={subset_size} is out of range: only {len(candidates)} "
            f"image/label pairs are available in {input_images_dir!r}"
        )

    # Create output directories if they don't exist
    os.makedirs(output_images_dir, exist_ok=True)
    os.makedirs(output_labels_dir, exist_ok=True)

    rng = random if seed is None else random.Random(seed)

    # Copy each selected image together with its label
    for image, label_file in rng.sample(candidates, subset_size):
        shutil.copy(os.path.join(input_images_dir, image), output_images_dir)
        shutil.copy(os.path.join(input_labels_dir, label_file), output_labels_dir)

# Example usage: build a 100-image training subset next to the full training split.
input_images_dir = 'datasets/images/train'
input_labels_dir = 'datasets/labels/train'
output_images_dir = 'datasets/images/train_subset'
output_labels_dir = 'datasets/labels/train_subset'
subset_size = 100

# Seed the RNG so that re-running this cell selects the same images again.
# Without a seed, every re-run adds a *different* random sample to the same
# output directories, and the subset silently grows beyond `subset_size`.
# NOTE(review): this relies on the directory listing order being stable between runs.
random.seed(0)

create_subset(input_images_dir, input_labels_dir, output_images_dir, output_labels_dir, subset_size)


In [2]:
# The three YOLOv8 variants compared in the experiments below.
# model1 is built from a '.yaml' architecture file; model2 and model3 are loaded from '.pt' weight files.
model1 = YOLO('yolov8n.yaml')       # Nano model initialized from scratch (no existing weights)
model2 = YOLO('yolov8n.pt')         # Nano model initialized from existing weights
model3 = YOLO('yolov8m.pt')         # Medium-size model initialized from existing weights

In [None]:
# Train one model/dataset combination per execution of this cell.
# Only the active line runs; the commented-out lines are the other combinations
# (model1/model2/model3 x config.yaml/config2.yaml), each using 20 epochs at 640 px.
# NOTE(review): the training log later in this notebook shows config2.yaml reading labels/train_subset,
# so config2.yaml presumably selects the subset and config.yaml the full training split — confirm.
# NOTE(review): several lines reuse the same model object; training it a second time presumably
# starts from the weights left by the first run. Re-create the model first if runs must be independent.
results1 = model1.train(data='./datasets/config2.yaml', epochs=20, imgsz=640)
# results2 = model1.train(data='./datasets/config.yaml', epochs=20, imgsz=640)
# results3 = model2.train(data='./datasets/config2.yaml', epochs=20, imgsz=640)
# results4 = model2.train(data='./datasets/config.yaml', epochs=20, imgsz=640)
# results5 = model3.train(data='./datasets/config2.yaml', epochs=20, imgsz=640)
# results6 = model3.train(data='./datasets/config.yaml', epochs=20, imgsz=640)

In [6]:
def count_parameters(model):
    """Return the total number of scalar elements across all parameters of ``model``.

    Every tensor yielded by ``model.parameters()`` is counted, whether or not
    it requires gradients.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

def count_conv_layers(model):
    """Return how many ``torch.nn.Conv2d`` modules ``model`` contains.

    Iterates over ``model.modules()``, so nested sub-modules are included.
    """
    conv_total = 0
    for module in model.modules():
        if isinstance(module, torch.nn.Conv2d):
            conv_total += 1
    return conv_total

# Calculate parameters and convolutional layers of model3 (loaded from 'yolov8m.pt' above).
num_parameters = count_parameters(model3)
num_conv_layers = count_conv_layers(model3)

# The "YOLOv8M" label is hard-coded in the messages below; update it if a different model is passed above.
print(f"Number of parameters in YOLOv8M: {num_parameters}")
print(f"Number of convolutional layers in YOLOv8M: {num_conv_layers}")

Number of parameters in YOLOv8M: 25902640
Number of convolutional layers in YOLOv8M: 84


In [3]:
# Ablation: set every listed augmentation hyperparameter to 0.0 and retrain
# the pretrained nano model on the same data and schedule as before.
augmentations = dict.fromkeys(
    (
        # colour-space jitter
        'hsv_h', 'hsv_s', 'hsv_v',
        # geometric transforms
        'degrees', 'translate', 'scale', 'shear', 'perspective',
        # flips
        'flipud', 'fliplr',
        # multi-image and erasing augmentations
        'mosaic', 'mixup', 'copy_paste', 'erasing',
    ),
    0.0,
)

# close_mosaic equals the epoch count; the log prints "Closing dataloader mosaic" before epoch 1.
no_aug_results = model2.train(
    data='./datasets/config2.yaml',
    epochs=20,
    imgsz=640,
    close_mosaic=20,
    **augmentations,
)

Ultralytics YOLOv8.1.46 🚀 Python-3.10.12 torch-1.13.1+cu117 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 5938MiB)


[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=./datasets/config2.yaml, epochs=20, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train28, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=20, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=True, line_width=None, format=torchscript, keras=False, optimize=Fals

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmulukutla-p[0m ([33mmkp[0m). Use [1m`wandb login --relogin`[0m to force relogin


Freezing layer 'model.22.dfl.conv.weight'
[34m[1mAMP: [0mrunning Automatic Mixed Precision (AMP) checks with YOLOv8n...
[34m[1mAMP: [0mchecks passed ✅


[34m[1mtrain: [0mScanning /home/prani/academics/fourth_year_sem2/cv/assi3/datasets/labels/train_subset.cache... 381 images, 0 backgrounds, 0 corrupt: 100%|██████████| 381/381 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /home/prani/academics/fourth_year_sem2/cv/assi3/datasets/labels/val.cache... 50 images, 0 backgrounds, 0 corrupt: 100%|██████████| 50/50 [00:00<?, ?it/s]


Plotting labels to runs/detect/train28/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mruns/detect/train28[0m
Starting training for 20 epochs...
Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/20      2.13G      1.139      3.361      1.337         13        640: 100%|██████████| 24/24 [00:03<00:00,  6.04it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00,  6.83it/s]

                   all         50         52     0.0034      0.981      0.577      0.382






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/20      2.11G      1.056      2.858      1.288         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.35it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 10.12it/s]


                   all         50         52      0.727     0.0524      0.416      0.188

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/20      2.09G      1.059      2.678      1.289         14        640: 100%|██████████| 24/24 [00:02<00:00,  9.55it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 10.62it/s]

                   all         50         52       0.54      0.271      0.377      0.163






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/20      2.09G      1.059      2.392      1.292         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.54it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.18it/s]

                   all         50         52      0.292      0.327      0.224       0.12






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/20      2.09G     0.9623      2.136      1.202         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.05it/s]


                   all         50         52      0.527      0.519      0.444      0.222

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/20      2.09G     0.8659       1.78      1.118         14        640: 100%|██████████| 24/24 [00:02<00:00,  9.78it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.38it/s]

                   all         50         52      0.654      0.481       0.52      0.306






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/20      2.09G     0.7692      1.451      1.058         14        640: 100%|██████████| 24/24 [00:02<00:00,  9.83it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.77it/s]

                   all         50         52       0.53        0.5       0.49      0.313






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/20      2.09G     0.6645      1.212     0.9906         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.56it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 12.47it/s]

                   all         50         52       0.63      0.558      0.561      0.357






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/20      2.09G     0.6054      1.031     0.9584         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.53it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 12.98it/s]

                   all         50         52      0.505      0.327      0.412      0.251






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/20      2.09G     0.5569     0.9554     0.9268         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.90it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.55it/s]

                   all         50         52       0.76      0.481      0.594      0.399






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/20      2.09G     0.4766     0.7744     0.8897         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.74it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.10it/s]

                   all         50         52      0.696      0.538      0.609      0.419






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/20      2.09G     0.4405     0.6951     0.8741         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.52it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.36it/s]

                   all         50         52      0.664      0.596      0.568      0.392






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/20      2.09G     0.4113     0.6481     0.8599         15        640: 100%|██████████| 24/24 [00:02<00:00,  9.81it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 12.16it/s]

                   all         50         52      0.749      0.596      0.624      0.442






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/20      2.09G     0.3728     0.5703     0.8458         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.61it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.39it/s]

                   all         50         52      0.667      0.654      0.668      0.449






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/20      2.09G     0.3387     0.5048     0.8356         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.71it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.85it/s]

                   all         50         52      0.776      0.596      0.636      0.471






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      16/20      2.09G     0.3104      0.465     0.8253         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.89it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 10.15it/s]


                   all         50         52      0.733      0.577      0.662      0.476

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      17/20      2.09G     0.2752     0.4225     0.8154         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 12.46it/s]

                   all         50         52      0.826      0.638       0.69      0.513






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      18/20      2.09G     0.2361     0.3729     0.8065         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.66it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 12.11it/s]

                   all         50         52      0.808      0.647      0.693      0.512






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      19/20      2.09G     0.2025     0.3417     0.7993         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.74it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 12.14it/s]

                   all         50         52       0.79       0.65      0.693      0.511






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      20/20      2.09G     0.1727     0.3167      0.795         13        640: 100%|██████████| 24/24 [00:02<00:00,  9.68it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00, 11.76it/s]

                   all         50         52       0.79      0.654      0.685      0.507






20 epochs completed in 0.018 hours.
Optimizer stripped from runs/detect/train28/weights/last.pt, 6.2MB
Optimizer stripped from runs/detect/train28/weights/best.pt, 6.2MB

Validating runs/detect/train28/weights/best.pt...
Ultralytics YOLOv8.1.46 🚀 Python-3.10.12 torch-1.13.1+cu117 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 5938MiB)
Model summary (fused): 168 layers, 3005843 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 2/2 [00:00<00:00,  5.69it/s]


                   all         50         52      0.822      0.635       0.69      0.512
Speed: 1.1ms preprocess, 1.4ms inference, 0.0ms loss, 0.6ms postprocess per image
Results saved to [1mruns/detect/train28[0m


0,1
lr/pg0,▃▅▆██▇▇▇▆▆▅▅▄▄▃▃▂▂▁▁
lr/pg1,▃▅▆██▇▇▇▆▆▅▅▄▄▃▃▂▂▁▁
lr/pg2,▃▅▆██▇▇▇▆▆▅▅▄▄▃▃▂▂▁▁
metrics/mAP50(B),▆▄▃▁▄▅▅▆▄▇▇▆▇█▇█████
metrics/mAP50-95(B),▆▂▂▁▃▄▄▅▃▆▆▆▇▇▇▇████
metrics/precision(B),▁▇▆▃▅▇▅▆▅▇▇▇▇▇█▇████
metrics/recall(B),█▁▃▃▅▄▄▅▃▄▅▅▅▆▅▅▅▅▆▅
model/GFLOPs,▁
model/parameters,▁
model/speed_PyTorch(ms),▁

0,1
lr/pg0,0.00012
lr/pg1,0.00012
lr/pg2,0.00012
metrics/mAP50(B),0.69004
metrics/mAP50-95(B),0.51154
metrics/precision(B),0.82155
metrics/recall(B),0.63462
model/GFLOPs,8.194
model/parameters,3011043.0
model/speed_PyTorch(ms),2.6


## 2.1
The dataset is divided into train and val partitions, each with an images directory and a labels directory. An image and its label file share the same base name, which is how they are matched. The images are in .jpg format and the labels are in .txt format. 

## 2.2
YOLO (You Only Look Once) is a real-time object detection algorithm. It differs fundamentally from R-CNN and similar algorithms because it requires only a single pass over the image to make its detections, whereas those algorithms require multiple passes: they first generate region proposals and then classify each proposed region. YOLO instead uses convolutional layers followed by fully connected layers. 

In YOLO, a single CNN is used to directly predict the class labels and bounding boxes of objects within an image. These models are trained end-to-end using a large dataset of labeled images and their associated object bounding boxes. They are therefore much faster than R-CNN and its successors. Also, YOLO models are often pretrained on ImageNet or other datasets, so their performance is often better than expected.

YOLO
The image is divided into a grid, and each grid cell predicts B bounding boxes and confidence scores for those boxes. YOLO models use non-maximum suppression (NMS). NMS is used to identify and remove redundant or incorrect bounding boxes and to output a single bounding box for each object in the image. This gets rid of overlapping bounding boxes for the same object.

YOLOv4
The main improvement in YOLO v4 over previous versions is the use of a new CNN architecture called CSPNet. It is a variant of the ResNet architecture for detection. YOLO v4 uses k-means clustering to generate anchor boxes. This groups the ground truth bounding boxes into clusters and then uses the centroids of the clusters as the anchor boxes. This allows the anchor boxes to be more closely aligned with the detected objects' size and shape.

YOLOv7
A key improvement in YOLO v7 is the use of a new loss function called focal loss. Previous versions of YOLO used a standard cross-entropy loss function, which is known to be less effective at detecting small objects. Focal loss down-weights the loss for well-classified examples and focuses on the hard examples. This helps the model to focus on the difficult examples and improve its performance on small objects. YOLO v7 also has a higher resolution than the previous versions. It processes images at a resolution of 608 by 608 pixels, which is higher than the 416 by 416 resolution used in previous versions. This allows the model to detect smaller objects more accurately.

## 2.3
YOLOv8n : 3.2 million parameters and 64 conv layers

YOLOv8m : 25.9 million parameters and 84 conv layers

## 2.4
[Link to the Report containing the results](https://api.wandb.ai/links/mkp/16f9hq8k)

(iii) The increase in dataset size mattered a lot more for the untrained model, whereas there was not much difference in the pretrained models. So, for the pretrained models, the larger dataset increased training time without a significant increase in mAP. Also, YOLOv8m did not perform better than the nano version. I believe this is because the dataset is not large enough to benefit from the increased complexity of the model. If we had a larger and more varied dataset, we definitely would have seen better performance from the larger model.

(iv) The comparison is done in the report linked above. I observe that the larger model tends to have multiple boxes on the same duck sometimes, compared to the smaller one. This is because the larger model is more complex and can detect more features, but it is not always beneficial. The smaller model is more consistent in its predictions and is more likely to predict the same box for the same duck across different epochs.

## 2.5
(i) Yes, the default run uses augmentations. The augmentations are listed [here](https://docs.ultralytics.com/modes/train/#augmentation-settings-and-hyperparameters). The augmentations are used to increase the diversity of the dataset and help the model generalize better. For example, the `scale` parameter scales the image by a gain factor, simulating objects at different distances from the camera.

(ii) [Link to the Report containing the results](https://api.wandb.ai/links/mkp/16f9hq8k). This contains the comparison with the model I trained with minimal augmentations. I did this on the best performing model from the previous section. The mAP50 decreased from 0.78 to 0.67 due to the lack of augmentations. This shows that augmentations are important for the model to generalize better and perform well on unseen data. The change can be said to be fairly significant, because without augmentations the model is not able to learn the features of the objects well enough to generalize to unseen data.

(iii) I think hue and translate are two important augmentations. I arrived at this conclusion by trial and error. When I set the hue and a few other parameters to 0, I saw a drop in mAP, which remained the same even when I changed some other augmentations. This shows that hue is an important augmentation. Also, translate aids in learning to detect partially visible objects by translating the image horizontally and vertically by a fraction of the image size. Mosaic is also important because it combines four images into one, which increases the diversity of the dataset and helps the model generalize better.