In [1]:
import ast
import os
import shutil

import numpy as np
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import clear_output
from sklearn.model_selection import StratifiedKFold, train_test_split
from tqdm import tqdm
# Echo the value of every top-level expression in a cell, not only the last one
# (this is why later cells show several results at once).
InteractiveShell.ast_node_interactivity = "all"

In [2]:
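# One-time setup, left commented out: uncomment to fetch YOLOv5.
# Presumably run from /app/_data, since the next cell changes into /app/_data/yolov5.
# !git clone https://github.com/ultralytics/yolov5  # clone repo
# clear_output()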

In [3]:
# Work from inside the YOLOv5 checkout so that the relative references used by
# later cells (requirements.txt, train.py) resolve.
os.chdir("/app/_data/yolov5")

In [4]:
!pip install -r requirements.txt
clear_output()

In [5]:
# Primary annotation table; later cells use every column except the first one.
df = pd.read_csv("/app/_data/bboxes.csv")
# Additional dataset annotations; later cells read its patientId, Target, w, h
# and x_center columns.
labels = pd.read_csv('/app/_data/additional_dataset/labels_full.csv')

In [6]:
# Class balance of the additional dataset.
labels['Target'].value_counts()
# Shape of the Target == 0 rows that have no x_center; its row count matching the
# Target == 0 count above means no negative row carries a box.
labels[(labels['Target']==0)&(labels['x_center']).isna()].shape

0    20672
1     9555
Name: Target, dtype: int64

(20672, 17)

In [7]:
# Rename/derive the additional dataset's columns so they match df's schema,
# then keep exactly df's columns (minus its first one), in df's order.
labels = labels.assign(
    img=labels['patientId'],
    bbox=labels['Target'],
    width=labels['w'],
    height=labels['h'],
    # Human-readable class name derived from the 0/1 target flag.
    label=labels['Target'].replace({0:'negative', 1:'positive'}),
)[df.columns[1:]]
labels.head()

Unnamed: 0,img,label,bbox,x_center,y_center,width,height,class
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,negative,0,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,negative,0,,,,,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,negative,0,,,,,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,negative,0,,,,,0
4,00436515-870c-4b36-a041-de91049b9ab4,positive,1,0.361816,0.333496,0.208008,0.370117,0


In [8]:
# Stack both annotation sources (df without its first column, so the schemas
# line up), shuffle the rows with a fixed seed, and renumber them from 0.
df = (
    pd.concat([df.loc[:, df.columns[1:]], labels], axis=0, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [9]:
# Location each image will have inside the YOLOv5 dataset folder.
df = df.assign(path='/app/_data/yolo5_dataset/' + df['img'] + '.jpg')
df

Unnamed: 0,img,label,bbox,x_center,y_center,width,height,class,path
0,c73c6a49-00ca-4862-8efa-785e3d3a39f7,negative,0,,,,,0,/app/_data/yolo5_dataset/c73c6a49-00ca-4862-8e...
1,40b011e6-7d68-482e-b8b0-664a61ebe21d,positive,1,0.735840,0.562500,0.245117,0.412109,0,/app/_data/yolo5_dataset/40b011e6-7d68-482e-b8...
2,220d7304a493,negative,0,,,,,0,/app/_data/yolo5_dataset/220d7304a493.jpg
3,40820805-1e1f-439a-b287-11a50a4d2b19,negative,0,,,,,0,/app/_data/yolo5_dataset/40820805-1e1f-439a-b2...
4,524d06460f5f,typical,1,0.879364,0.541744,0.179419,0.411874,0,/app/_data/yolo5_dataset/524d06460f5f.jpg
...,...,...,...,...,...,...,...,...,...
39811,a5c5e8425f03,typical,1,0.656480,0.469235,0.278839,0.695406,0,/app/_data/yolo5_dataset/a5c5e8425f03.jpg
39812,180bbb1f-4105-4fa1-94cf-5bb8294eaae8,positive,1,0.672852,0.579102,0.267578,0.328125,0,/app/_data/yolo5_dataset/180bbb1f-4105-4fa1-94...
39813,fcb9ac7c-e587-48e2-932d-ce270d6d66b8,positive,1,0.379883,0.545410,0.160156,0.129883,0,/app/_data/yolo5_dataset/fcb9ac7c-e587-48e2-93...
39814,149c73c79507,atypical,1,0.712205,0.385185,0.102796,0.271562,0,/app/_data/yolo5_dataset/149c73c79507.jpg


In [10]:
# Shared settings for the notebook.
# NOTE(review): "IMG_HEIGH" looks like a typo of "IMG_HEIGHT"; the key is kept
# unchanged because other code may read it under this spelling.
base_config = dict(
    IMG_HEIGH=640,
    IMG_WIDTH=640,
    BATCH_SIZE=20,
    SEED=42,
)

In [11]:
# One row per image: the first label, bbox flag and path recorded for it,
# with the image id moved from the index into an ordinary "img" column.
groupped_data = (
    df.pivot_table(index='img', values=['label', 'bbox', 'path'], aggfunc='first')
    .assign(img=lambda per_image: per_image.index)
    .reset_index(drop=True)
)

In [12]:
# Preview the per-image table.
groupped_data.head()
# Number of images with (1) and without (0) at least one box.
groupped_data['bbox'].value_counts()

Unnamed: 0,bbox,label,path,img
0,0,negative,/app/_data/yolo5_dataset/0004cfab-14fd-4e49-80...,0004cfab-14fd-4e49-80ba-63a80b6bddd6
1,0,negative,/app/_data/yolo5_dataset/000924cf-0f8d-42bd-91...,000924cf-0f8d-42bd-9158-1af53881a557
2,1,typical,/app/_data/yolo5_dataset/000a312787f2.jpg,000a312787f2
3,0,negative,/app/_data/yolo5_dataset/000c3a3f293f.jpg,000c3a3f293f
4,1,positive,/app/_data/yolo5_dataset/000db696-cf54-4385-b1...,000db696-cf54-4385-b10b-6b16fbb3f985
...,...,...,...,...
32709,1,positive,/app/_data/yolo5_dataset/fffb2395-8edd-4954-8a...,fffb2395-8edd-4954-8a89-ffe2fd329be3
32710,0,negative,/app/_data/yolo5_dataset/fffba05a-1635-4545-9b...,fffba05a-1635-4545-9bbd-57ad4cfe8d27
32711,0,negative,/app/_data/yolo5_dataset/fffc95b5-605b-4226-80...,fffc95b5-605b-4226-80ab-62caec682b22
32712,0,negative,/app/_data/yolo5_dataset/fffcff11-d018-4414-97...,fffcff11-d018-4414-971a-a7cefa327795


In [14]:
# Split the images into 5 shuffled folds, stratified by label.
# train_ids[k] / val_ids[k] hold the row positions of fold k in groupped_data.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=base_config["SEED"])
fold_splits = list(skf.split(groupped_data, groupped_data['label']))
train_ids = [train_part for train_part, _ in fold_splits]
val_ids = [valid_part for _, valid_part in fold_splits]

In [16]:
# Write one train/val image-path list per fold (train_<k>.txt / val_<k>.txt).
# The fold count comes from the split lists themselves rather than a literal 5,
# so this cell stays correct if the number of splits changes.
for i, (fold_train_rows, fold_val_rows) in enumerate(zip(train_ids, val_ids)):
    # The split yields row positions, so select positionally with .iloc;
    # .loc only matched because groupped_data has a fresh 0..n-1 index.
    train_img_path = groupped_data.iloc[fold_train_rows]['path'].tolist()
    val_img_path = groupped_data.iloc[fold_val_rows]['path'].tolist()
    np.savetxt("/app/_data/yolo5_dataset/train_"+str(i)+".txt", train_img_path, fmt='%s')
    np.savetxt("/app/_data/yolo5_dataset/val_"+str(i)+".txt", val_img_path, fmt='%s')


In [17]:
# Populate the YOLOv5 dataset folder: copy every image and, for images whose
# first row has bbox == 1, write a sibling .txt file with one
# "class x_center y_center width height" line per annotation row.
#
# The frame is grouped once instead of being re-filtered for every image,
# which made the original loop quadratic in the number of rows.
# sort=False keeps images in order of first appearance, and rows inside a
# group keep their original order, so the files written are unchanged.
label_columns = ['class', 'x_center', 'y_center', 'width', 'height']
rows_by_image = df.groupby('img', sort=False)
for img_name, new_df in tqdm(rows_by_image, total=rows_by_image.ngroups):
    shutil.copy(
        "/app/_data/jpg/" + img_name + ".jpg",
        os.path.join("/app/_data/yolo5_dataset/", img_name + ".jpg"),
    )
    # Images without boxes get no label file.
    if new_df['bbox'].iloc[0] == 1:
        boxes = new_df[label_columns].values
        # Values are stringified one by one, exactly as before, so the text
        # written to disk is identical.
        list_boxes = [[str(value) for value in box] for box in boxes]
        np.savetxt("/app/_data/yolo5_dataset/"+img_name+".txt", list_boxes, fmt='%s')
clear_output()

In [19]:
# customize iPython writefile so we can write variables

from IPython.core.magic import register_line_cell_magic


@register_line_cell_magic
def writetemplate(line, cell=None):
    """Write the cell body to a file after substituting notebook globals.

    Usage::

        %%writetemplate /path/to/file
        text with {SOME_GLOBAL} placeholders

    Args:
        line: Destination file path (the rest of the magic's first line).
        cell: Cell body, rendered with ``str.format(**globals())``. Literal
            curly braces in the body must therefore be doubled.

    Raises:
        ValueError: If used as a line magic (no cell body) or without a path.
        KeyError: If the body references a name that is not a global.
    """
    # The magic is registered for both line and cell use; without a default
    # for `cell`, line-magic use failed with an opaque TypeError.
    if cell is None:
        raise ValueError(
            "writetemplate needs a cell body; use it as %%writetemplate <path>"
        )
    path = line.strip()
    if not path:
        raise ValueError("writetemplate needs a destination path")
    # Explicit encoding so the output does not depend on the locale.
    with open(path, "w", encoding="utf-8") as f:
        f.write(cell.format(**globals()))

In [20]:
# Ensure the folder for the generated YAML files exists. makedirs with
# exist_ok=True is safe to re-run, has no check-then-create race, and also
# creates missing parent directories.
os.makedirs("/app/_data/yolo5_dataset/DataFile", exist_ok=True)

In [21]:
%%writetemplate /app/_data/yolo5_dataset/DataFile/data.yaml
# Dataset description passed to train.py via --data.
# This text is rendered through str.format by the writetemplate magic,
# so it must not contain literal curly braces.
# train and val data
# Only fold 0 is referenced here; the train_1..4 / val_1..4 lists written
# earlier are not used unless these two paths are edited.
train: /app/_data/yolo5_dataset/train_0.txt
val: /app/_data/yolo5_dataset/val_0.txt
# number of classes
nc: 1
# class names
names: ["opacity"]

In [22]:
%%writetemplate /app/_data/yolo5_dataset/DataFile/customYOLOv5x6.yaml
# Model definition passed to train.py via --cfg.
# This text is rendered through str.format by the writetemplate magic,
# so it must not contain literal curly braces.
# nc below agrees with the single class declared in data.yaml.
# parameters
nc: 1  # number of classes
depth_multiple: 1.33  # model depth multiple
width_multiple: 1.25  # layer channel multiple

# anchors
# One row of three anchors per detection level (P3 to P6), matching the four
# inputs of the Detect layer at the end of the head.
anchors:
  - [ 19,27,  44,40,  38,94 ]  # P3/8
  - [ 96,68,  86,152,  180,137 ]  # P4/16
  - [ 140,301,  303,264,  238,542 ]  # P5/32
  - [ 436,615,  739,380,  925,792 ]  # P6/64

# YOLOv5 backbone
# The numbers in the trailing comments are layer indices; the head refers to
# them in its Concat and Detect "from" fields.
backbone:
  # [from, number, module, args]
  [ [ -1, 1, Focus, [ 64, 3 ] ],  # 0-P1/2
    [ -1, 1, Conv, [ 128, 3, 2 ] ],  # 1-P2/4
    [ -1, 3, C3, [ 128 ] ],
    [ -1, 1, Conv, [ 256, 3, 2 ] ],  # 3-P3/8
    [ -1, 9, C3, [ 256 ] ],
    [ -1, 1, Conv, [ 512, 3, 2 ] ],  # 5-P4/16
    [ -1, 9, C3, [ 512 ] ],
    [ -1, 1, Conv, [ 768, 3, 2 ] ],  # 7-P5/32
    [ -1, 3, C3, [ 768 ] ],
    [ -1, 1, Conv, [ 1024, 3, 2 ] ],  # 9-P6/64
    [ -1, 1, SPP, [ 1024, [ 3, 5, 7 ] ] ],
    [ -1, 3, C3, [ 1024, False ] ],  # 11
  ]

# YOLOv5 head
head:
  [ [ -1, 1, Conv, [ 768, 1, 1 ] ],
    [ -1, 1, nn.Upsample, [ None, 2, 'nearest' ] ],
    [ [ -1, 8 ], 1, Concat, [ 1 ] ],  # cat backbone P5
    [ -1, 3, C3, [ 768, False ] ],  # 15

    [ -1, 1, Conv, [ 512, 1, 1 ] ],
    [ -1, 1, nn.Upsample, [ None, 2, 'nearest' ] ],
    [ [ -1, 6 ], 1, Concat, [ 1 ] ],  # cat backbone P4
    [ -1, 3, C3, [ 512, False ] ],  # 19

    [ -1, 1, Conv, [ 256, 1, 1 ] ],
    [ -1, 1, nn.Upsample, [ None, 2, 'nearest' ] ],
    [ [ -1, 4 ], 1, Concat, [ 1 ] ],  # cat backbone P3
    [ -1, 3, C3, [ 256, False ] ],  # 23 (P3/8-small)

    [ -1, 1, Conv, [ 256, 3, 2 ] ],
    [ [ -1, 20 ], 1, Concat, [ 1 ] ],  # cat head P4
    [ -1, 3, C3, [ 512, False ] ],  # 26 (P4/16-medium)

    [ -1, 1, Conv, [ 512, 3, 2 ] ],
    [ [ -1, 16 ], 1, Concat, [ 1 ] ],  # cat head P5
    [ -1, 3, C3, [ 768, False ] ],  # 29 (P5/32-large)

    [ -1, 1, Conv, [ 768, 3, 2 ] ],
    [ [ -1, 12 ], 1, Concat, [ 1 ] ],  # cat head P6
    [ -1, 3, C3, [ 1024, False ] ],  # 32 (P6/64-xlarge)

    [ [ 23, 26, 29, 32 ], 1, Detect, [ nc, anchors ] ],  # Detect(P3, P4, P5, P6)
  ]


In [24]:
%%time
# Train the custom x6 model on the fold-0 lists referenced by data.yaml.
# - Training starts from the last.pt checkpoint of an earlier run that was also
#   named yolov5x6_voi_0; since --name reuses that name, the log below reports
#   save_dir='runs/train/yolov5x6_voi_02'.
# - --img 1024 and --batch 10 are set here directly; base_config (640 / 20)
#   is not consulted.
# - train.py is resolved relative to the YOLOv5 checkout entered earlier.
!python train.py --img 1024 \
                 --batch 10 \
                 --epochs 50 \
                 --data /app/_data/yolo5_dataset/DataFile/data.yaml \
                 --cfg /app/_data/yolo5_dataset/DataFile/customYOLOv5x6.yaml \
                 --weights /app/_data/yolov5/runs/train/yolov5x6_voi_0/weights/last.pt  \
                 --name yolov5x6_voi_0 \
                 --cache

[34m[1mgithub: [0mskipping check (Docker image), for updates see https://github.com/ultralytics/yolov5
YOLOv5 🚀 v5.0-150-gabb2a96 torch 1.7.1+cu110 CUDA:0 (NVIDIA GeForce RTX 3090, 24268.3125MB)

Namespace(adam=False, artifact_alias='latest', batch_size=10, bbox_interval=-1, bucket='', cache_images=True, cfg='/app/_data/yolo5_dataset/DataFile/customYOLOv5x6.yaml', data='/app/_data/yolo5_dataset/DataFile/data.yaml', device='', entity=None, epochs=50, evolve=False, exist_ok=False, global_rank=-1, hyp='data/hyp.scratch.yaml', image_weights=False, img_size=[1024, 1024], label_smoothing=0.0, linear_lr=False, local_rank=-1, multi_scale=False, name='yolov5x6_voi_0', noautoanchor=False, nosave=False, notest=False, project='runs/train', quad=False, rect=False, resume=False, save_dir='runs/train/yolov5x6_voi_02', save_period=-1, single_cls=False, sync_bn=False, total_batch_size=10, upload_dataset=False, weights='/app/_data/yolov5/runs/train/yolov5x6_voi_0/weights/last.pt', workers=8, world_si