In [2]:
import shutil, os
from os import listdir
from os.path import isfile, join
import yaml
from glob import glob
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import seaborn as sns

In [3]:
# Root directory of the prepared dataset; every path below is derived from it.
DATA_DIR = './dataset_640'
# Number of cross-validation folds to export (fold ids 0 .. NUM_FOLD - 1).
NUM_FOLD = 5
# Class names written to the `names` entry of each YOLOv5 dataset YAML.
classes = ['0. opacity']

In [4]:
# Per-image metadata table; the `fold` column it carries is what drives the
# train/val split further down.
train_df = pd.read_csv('./meta_all.csv')
train_df

Unnamed: 0,image_id,study_id,patient_id,rows,columns,boxes,label,id,fold
0,000a312787f2,5776db0cec75,dcc3fdf6426a,3488,4256,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,000a312787f2_image,2
1,000c3a3f293f,ff0879eb20ed,f09ff9b7dab3,2320,2832,,none 1 0 0 1 1,000c3a3f293f_image,4
2,0012ff7358bc,9d514ce429a7,6a02c0731039,2544,3056,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,0012ff7358bc_image,2
3,001398f4ff4f,28dddc8559b2,db63d9b7f61c,3520,4280,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,001398f4ff4f_image,0
4,001bd15d1891,dfd9fdd85a3e,605693f36f43,2800,3408,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,001bd15d1891_image,1
...,...,...,...,...,...,...,...,...,...
6329,ffcc6edd9445,7e6c68462e06,71b5edab804b,4240,3480,,none 1 0 0 1 1,ffcc6edd9445_image,0
6330,ffd91a2c4ca0,8332bdaddb6e,41397b2c9d10,2800,3408,,none 1 0 0 1 1,ffd91a2c4ca0_image,2
6331,ffd9b6cf2961,7eed9af03814,19ca51007544,2388,3050,"[{'x': 2197.38566, 'y': 841.07361, 'width': 31...",opacity 1 2197.38566 841.07361 2513.80265 1292...,ffd9b6cf2961_image,1
6332,ffdc682f7680,a0cb0b96fb3d,eb981a5bc177,3488,4256,"[{'x': 2729.27083, 'y': 332.26044, 'width': 14...",opacity 1 2729.27083 332.26044 4225.52099 2936...,ffdc682f7680_image,3


In [4]:
# Build each image's path as <DATA_DIR>/image/<image_id>_image.png.
image_dir_prefix = f'{DATA_DIR}/image/'
train_df['image_path'] = image_dir_prefix + train_df['image_id'] + '_image.png'
train_df.head()

Unnamed: 0,image_id,study_id,patient_id,rows,columns,boxes,label,id,fold,image_path
0,000a312787f2,5776db0cec75,dcc3fdf6426a,3488,4256,"[{'x': 789.28836, 'y': 582.43035, 'width': 102...",opacity 1 789.28836 582.43035 1815.94498 2499....,000a312787f2_image,2,/home/youngkim21/kaggle-SIIM-covid-19/dataset_...
1,000c3a3f293f,ff0879eb20ed,f09ff9b7dab3,2320,2832,,none 1 0 0 1 1,000c3a3f293f_image,4,/home/youngkim21/kaggle-SIIM-covid-19/dataset_...
2,0012ff7358bc,9d514ce429a7,6a02c0731039,2544,3056,"[{'x': 677.42216, 'y': 197.97662, 'width': 867...",opacity 1 677.42216 197.97662 1545.21983 1197....,0012ff7358bc_image,2,/home/youngkim21/kaggle-SIIM-covid-19/dataset_...
3,001398f4ff4f,28dddc8559b2,db63d9b7f61c,3520,4280,"[{'x': 2729, 'y': 2181.33331, 'width': 948.000...",opacity 1 2729 2181.33331 3677.00012 2785.33331,001398f4ff4f_image,0,/home/youngkim21/kaggle-SIIM-covid-19/dataset_...
4,001bd15d1891,dfd9fdd85a3e,605693f36f43,2800,3408,"[{'x': 623.23328, 'y': 1050, 'width': 714, 'he...",opacity 1 623.23328 1050 1337.23328 2156 opaci...,001bd15d1891_image,1,/home/youngkim21/kaggle-SIIM-covid-19/dataset_...


In [5]:
# Sanity check: show the path built for the row labelled 0.
train_df.loc[0, 'image_path']

'/home/youngkim21/kaggle-SIIM-covid-19/dataset_640/image/000a312787f2_image.png'

In [6]:
# Spot check: list the fold assigned to every row of one patient.
train_df.loc[train_df['patient_id'] == '112e8c966f47', ['patient_id', 'fold']]

Unnamed: 0,patient_id,fold
58,112e8c966f47,0
116,112e8c966f47,0
342,112e8c966f47,0
474,112e8c966f47,0
1018,112e8c966f47,0
1391,112e8c966f47,0
1608,112e8c966f47,0
1957,112e8c966f47,0
2024,112e8c966f47,0
2073,112e8c966f47,0


In [8]:
# Export one YOLOv5 dataset per cross-validation fold.
#
# For every fold this cell:
#   1. copies the fold's images and their label .txt files into
#      <DATA_DIR>/fold/<fold>/{images,labels}/{train,val},
#   2. writes train<fold>.txt / val<fold>.txt (one image path per line) to TXT_DIR,
#   3. writes ./yolov5/data/siim-cov19-fold<fold>.yaml pointing at those two
#      lists and prints it back for inspection.

TXT_DIR = f'{DATA_DIR}/yolo_txt/'
# Source directory of the label files; one '<image stem>.txt' is expected per image.
label_dir = f'{DATA_DIR}/label_txt'

# Loop-invariant: the list-file directory only has to be created once.
os.makedirs(TXT_DIR, exist_ok=True)


def _copy_split(image_paths, image_dst, label_dst, label_src):
    """Copy images and their label files into one split of a fold.

    Parameters
    ----------
    image_paths : iterable of str
        Source image paths to copy.
    image_dst, label_dst : str
        Destination directories for images and labels (created if missing).
    label_src : str
        Directory holding the '<image stem>.txt' label files.

    Returns
    -------
    list of str
        Destination path of every copied image, in input order.

    Raises
    ------
    FileNotFoundError
        If an image or its label file does not exist.
    """
    os.makedirs(image_dst, exist_ok=True)
    os.makedirs(label_dst, exist_ok=True)

    copied = []
    for image_path in tqdm(image_paths):
        # shutil.copy returns the path of the file it created inside image_dst.
        copied.append(shutil.copy(image_path, image_dst))
        # Separator-safe stem extraction (only the final extension is stripped).
        stem = os.path.splitext(os.path.basename(image_path))[0]
        shutil.copy(os.path.join(label_src, f'{stem}.txt'), label_dst)
    return copied


for fold in range(NUM_FOLD):
    # Rows whose `fold` equals the current fold are validation; the rest train.
    val_files = list(train_df[train_df.fold == fold].image_path.unique())
    train_files = list(train_df[train_df.fold != fold].image_path.unique())

    fold_dir = f'{DATA_DIR}/fold/{fold}'
    list_paths = {}
    for split, split_files in (('train', train_files), ('val', val_files)):
        copied_images = _copy_split(
            split_files,
            f'{fold_dir}/images/{split}',
            f'{fold_dir}/labels/{split}',
            label_dir,
        )

        # List exactly the images copied in this run instead of globbing the
        # directory: a glob would also pick up files left behind by an earlier
        # run with a different fold assignment and could put the same image in
        # both the train and the val list.
        list_paths[split] = join(TXT_DIR, f'{split}{fold}.txt')
        with open(list_paths[split], 'w') as list_file:
            list_file.writelines(path + '\n' for path in copied_images)

    data = dict(
        train=list_paths['train'],
        val=list_paths['val'],
        nc=len(classes),  # keep the class count in sync with `classes`
        names=classes,
    )

    yaml_path = f'./yolov5/data/siim-cov19-fold{fold}.yaml'
    with open(yaml_path, 'w') as outfile:
        yaml.dump(data, outfile, default_flow_style=False)

    # Echo the file back so the generated config is visible in the cell output.
    with open(yaml_path, 'r') as infile:
        print(infile.read())

  0%|          | 0/5067 [00:00<?, ?it/s]

  0%|          | 0/1267 [00:00<?, ?it/s]

names:
- 0. opacity
nc: 1
train: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/train0.txt
val: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/val0.txt



  0%|          | 0/5067 [00:00<?, ?it/s]

  0%|          | 0/1267 [00:00<?, ?it/s]

names:
- 0. opacity
nc: 1
train: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/train1.txt
val: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/val1.txt



  0%|          | 0/5067 [00:00<?, ?it/s]

  0%|          | 0/1267 [00:00<?, ?it/s]

names:
- 0. opacity
nc: 1
train: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/train2.txt
val: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/val2.txt



  0%|          | 0/5067 [00:00<?, ?it/s]

  0%|          | 0/1267 [00:00<?, ?it/s]

names:
- 0. opacity
nc: 1
train: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/train3.txt
val: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/val3.txt



  0%|          | 0/5068 [00:00<?, ?it/s]

  0%|          | 0/1266 [00:00<?, ?it/s]

names:
- 0. opacity
nc: 1
train: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/train4.txt
val: /home/youngkim21/kaggle-SIIM-covid-19/dataset_640/yolo_txt/val4.txt

