# Setup

Clone GitHub [repository](https://github.com/ultralytics/yolov5), install [dependencies](https://github.com/ultralytics/yolov5/blob/master/requirements.txt) and check PyTorch and GPU.

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# !git clone https://github.com/ultralytics/yolov5  # clone
# %cd yolov5
# %pip install -qr requirements.txt comet_ml  # install

import torch
import utils
display = utils.notebook_init()  # checks

YOLOv5 ðŸš€ v7.0-398-g5cdad892 Python-3.11.11 torch-2.5.1+cu124 CPU


Setup complete âœ… (2 CPUs, 12.7 GB RAM, 35.7/107.7 GB disk)


# 1. Detect

`detect.py` runs YOLOv5 inference on a variety of sources, downloading models automatically from the [latest YOLOv5 release](https://github.com/ultralytics/yolov5/releases), and saving results to `runs/detect`.
```

In [None]:
!python detect.py --weights yolov5x6.pt --img 1280 --conf 0.25 --source ../vdo.avi --save-txt --class 2 --save-conf
# display.Image(filename='runs/detect/exp/zidane.jpg', width=600)

[34m[1mdetect: [0mweights=['yolov3.pt'], source=../vdo.avi, data=data/coco128.yaml, imgsz=[1280, 1280], conf_thres=0.25, iou_thres=0.45, max_det=1000, device=, view_img=False, save_txt=True, save_format=0, save_csv=False, save_conf=True, save_crop=False, nosave=False, classes=[2], agnostic_nms=False, augment=False, visualize=False, update=False, project=runs/detect, name=exp, exist_ok=False, line_thickness=3, hide_labels=False, hide_conf=False, half=False, dnn=False, vid_stride=1
YOLOv5 ðŸš€ v7.0-398-g5cdad892 Python-3.11.11 torch-2.5.1+cu124 CUDA:0 (Tesla T4, 15095MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov3.pt to yolov3.pt...
100% 119M/119M [00:01<00:00, 123MB/s]

Fusing layers... 
yolov3 summary: 261 layers, 61922845 parameters, 0 gradients, 155.9 GFLOPs
video 1/1 (1/2141) /content/vdo.avi: 736x1280 7 cars, 82.5ms
video 1/1 (2/2141) /content/vdo.avi: 736x1280 7 cars, 73.1ms
video 1/1 (3/2141) /content/vdo.avi: 736x1280 7 cars, 72.1ms
video

#2. Evaluation

In [51]:
from utils import *
# Generate dictionary from the predictions
image_width, image_height = 1920, 1080
folder_path = "/run/detect/exp2/labels"

bboxes_dict_pred = read_bboxes_from_folder(folder_path, image_width, image_height)
print(bboxes_dict_pred)
len(bboxes_dict_pred)

{1219: [(1285, 356, 1518, 548), (921, 169, 1087, 304), (567, 95, 661, 169), (1179, 101, 1229, 166), (911, 93, 943, 142), (928, 78, 1013, 146), (579, 75, 663, 118), (874, 95, 920, 142)], 457: [(1284, 356, 1516, 548), (1310, 146, 1552, 265), (682, 121, 795, 206), (566, 94, 660, 168), (619, 49, 697, 84), (683, 49, 747, 81), (704, 81, 785, 131), (926, 79, 1015, 146), (1178, 101, 1228, 162), (577, 73, 656, 107), (910, 92, 942, 140), (878, 94, 924, 140)], 508: [(1282, 358, 1518, 547), (1309, 146, 1553, 266), (640, 237, 813, 378), (675, 127, 795, 212), (565, 94, 661, 169), (926, 79, 1014, 146), (1178, 101, 1228, 161), (577, 75, 660, 110), (910, 92, 942, 141), (874, 94, 920, 141)], 309: [(1285, 356, 1518, 547), (862, 105, 956, 178), (565, 95, 660, 169), (937, 79, 1014, 146), (1178, 102, 1228, 157), (581, 74, 658, 109), (561, 55, 609, 96), (900, 92, 942, 108)], 500: [(1284, 355, 1516, 548), (1309, 146, 1553, 265), (648, 211, 808, 341), (688, 113, 799, 193), (566, 93, 660, 168), (924, 78, 1014, 

2141

In [52]:
# Generate dictonary form groundtruth
gt_file_path = 'annotations_xml.txt'

gt_dict = read_gt_and_create_dict(gt_file_path)

print(gt_dict)
len(gt_dict)

{1: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 2: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 3: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 4: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 5: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 6: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 

2141

In [53]:
#Sort the dictionaries
gt_dict = dict(sorted(gt_dict.items()))
bboxes_dict_pred= dict(sorted(bboxes_dict_pred.items()))
print(gt_dict)
print(bboxes_dict_pred)

{1: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 2: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 3: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 4: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 5: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 927, 145), (1176, 82, 1253, 174), (1285, 363, 1516, 546), (931, 78, 1013, 146)], 6: [(558, 94, 663, 169), (573, 72, 661, 145), (913, 93, 972, 144), (894, 95, 944, 144), (878, 107, 

In [55]:
image_width, image_height = 1920, 1080
all_aps = []
start_idx=0
end_idx=500
aps_tanda = process_tanda(start_idx, end_idx, gt_dict, bboxes_dict_pred, image_width, image_height)
all_aps.extend(aps_tanda)

Processing frames 0-500: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 500/500 [03:48<00:00,  2.19it/s]


In [56]:
start_idx=500
end_idx=1050
aps_tanda = process_tanda(start_idx, end_idx, gt_dict, bboxes_dict_pred, image_width, image_height)
all_aps.extend(aps_tanda)  # Guardar los APs calculados

Processing frames 500-1050: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 550/550 [03:40<00:00,  2.50it/s]


In [57]:
start_idx=1050
end_idx=1600
aps_tanda = process_tanda(start_idx, end_idx, gt_dict, bboxes_dict_pred, image_width, image_height)
all_aps.extend(aps_tanda)  # Guardar los APs calculados

Processing frames 1050-1600: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 550/550 [03:50<00:00,  2.39it/s]


In [58]:
start_idx=1600
end_idx=2140
aps_tanda = process_tanda(start_idx, end_idx, gt_dict, bboxes_dict_pred, image_width, image_height)
all_aps.extend(aps_tanda)  # Guardar los APs calculados

Processing frames 1600-2140: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 540/540 [03:52<00:00,  2.33it/s]


In [59]:
# Calcular mAP final
mAP = np.mean(all_aps) if all_aps else 0
print(f"mAP final = {mAP}")

mAP final = 0.5640793123142316


### Results mAP in different yolo models with conf 0.4
pred3   mAP50=0.5734 mAP70=0.4954

pred4x5 mAP=0.564    mAP70=0.4435



mAP final = 0.5860682381434293 for pred5x6
mAP final = 0.6147277227162529 for pred3

# 3. Fine-tune
Fine-tuning of yolov5x6.pt model freezing the first 10 layers of the structure, so it can be more accurate for our data.

In [None]:
# Train YOLOv5s on COCO128 for 3 epochs
!python train.py --img 1280 --batch 4 --epochs 30 --data dataset.yaml --weights yolov5x6.pt  --freeze 10

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with torch.cuda.amp.autocast(amp):
  with tor