In [1]:
!pip install ultralytics;

Collecting ultralytics
  Downloading ultralytics-8.1.11-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting thop>=0.1.1 (from ultralytics)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Downloading ultralytics-8.1.11-py3-none-any.whl (709 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.5/709.5 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: thop, ultralytics
Successfully installed thop-0.1.1.post2209072238 ultralytics-8.1.11


In [2]:
import numpy as np
import pandas as pd
import os
import sys
import math
import random
import shutil

import cv2
import pydicom
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from ultralytics import YOLO

# Fix NumPy's global RNG so any np.random-based step is repeatable across runs.
SEED = 42
np.random.seed(SEED)

In [3]:
# shutil.rmtree('/kaggle/working/')

In [4]:
# Input locations (competition data) and output locations (working directory).
CSV_FILE = '../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv'
TRAIN_SRC_DIR = '../input/rsna-pneumonia-detection-challenge/stage_2_train_images/'
TEST_SRC_DIR = '../input/rsna-pneumonia-detection-challenge/stage_2_test_images/'
DATASET_DIR = './dataset/'
TEST_IMG_DIR = 'test_images/'

# Build <DATASET_DIR>/{images,labels}/{train,val,test}/.
# makedirs(exist_ok=True) creates missing parents and does not raise when the
# directory is already there, so this cell survives being run more than once.
for kind in ('images', 'labels'):
    for split in ('train', 'val', 'test'):
        os.makedirs(os.path.join(DATASET_DIR, kind, split), exist_ok=True)
os.makedirs(TEST_IMG_DIR, exist_ok=True)

# Prepare images and labels for training

In [5]:
annotations = pd.read_csv(CSV_FILE)
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line to the output.
annotations.info()
annotations.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30227 entries, 0 to 30226
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   patientId  30227 non-null  object 
 1   x          9555 non-null   float64
 2   y          9555 non-null   float64
 3   width      9555 non-null   float64
 4   height     9555 non-null   float64
 5   Target     30227 non-null  int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 1.4+ MB
None


Unnamed: 0,patientId,x,y,width,height,Target
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,,,,,0
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,,,,,0
2,00322d4d-1c29-4943-afc9-b6754be640eb,,,,,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,,,,,0
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1


In [6]:
# Row count per Target value; dropna=False would also surface missing targets.
annotations['Target'].value_counts(dropna=False)

Target
0    20672
1     9555
Name: count, dtype: int64

In [7]:
# Number of healthy (Target == 0) rows kept next to all pneumonia rows.
N_NEGATIVE_SAMPLES = 600

positive_annotations = annotations[annotations.Target == 1]
negative_annotations = annotations[annotations.Target == 0]

print('total  pneumo: ', positive_annotations['patientId'].shape[0])
print('unique pneumo: ', positive_annotations['patientId'].drop_duplicates().shape[0])
print('unique healthy:', negative_annotations['patientId'].shape[0])
# Every healthy patient must appear in exactly one row, otherwise sampling rows
# below would not be the same as sampling images.
assert negative_annotations['patientId'].is_unique

negative_sample = negative_annotations.sample(N_NEGATIVE_SAMPLES, random_state=42)

# NOTE(review): this rebinds `annotations`; re-running this cell without re-running
# the load cell samples from the already reduced frame.
annotations = pd.concat([positive_annotations, negative_sample])
print('\nupdated set:', annotations.shape)
print(annotations.Target.value_counts())
annotations.head()

total  pneumo:  9555
unique pneumo:  6012
unique healthy: 20672

updated set: (10155, 6)
Target
1    9555
0     600
Name: count, dtype: int64


Unnamed: 0,patientId,x,y,width,height,Target
4,00436515-870c-4b36-a041-de91049b9ab4,264.0,152.0,213.0,379.0,1
5,00436515-870c-4b36-a041-de91049b9ab4,562.0,152.0,256.0,453.0,1
8,00704310-78a8-4b38-8475-49f4573b2dbb,323.0,577.0,160.0,104.0,1
9,00704310-78a8-4b38-8475-49f4573b2dbb,695.0,575.0,162.0,137.0,1
14,00aecb01-a116-45a2-956c-08d2fa55433f,288.0,322.0,94.0,135.0,1


In [8]:
# Histogram of bounding boxes per image: count() skips NaN, so images without a
# box contribute to the 0 bucket.
boxes_per_image = annotations.groupby('patientId')['x'].count()
boxes_per_image.value_counts().sort_index()

x
0     600
1    2614
2    3266
3     119
4      13
Name: count, dtype: int64

In [9]:
# One row per image: patientId (index) -> Target.
patient_id_target = (
    annotations[['patientId', 'Target']]
    .drop_duplicates()
    .set_index('patientId')
)
# Later lookups use patient_id_target.loc[<ids>]; a duplicated index entry would
# return extra rows and break the one-label-per-image assumption used for stratifying.
assert patient_id_target.index.is_unique, 'a patientId maps to more than one Target'

In [10]:
# Split on unique patientIds so that all boxes of one image land in the same subset.
patient_id_series = annotations.patientId.drop_duplicates()
print('Number of images:', patient_id_series.size)

# First cut: 80% train / 20% held out, stratified on the per-image Target.
all_targets = patient_id_target.loc[patient_id_series].Target
train_series, val_test_split = train_test_split(
    patient_id_series,
    test_size=0.2,
    random_state=42,
    stratify=all_targets,
)

# Second cut: halve the held-out part into validation and test (10% / 10% overall).
held_out_targets = patient_id_target.loc[val_test_split].Target
val_series, test_series = train_test_split(
    val_test_split,
    test_size=0.5,
    random_state=42,
    stratify=held_out_targets,
)

print('Train set len:', len(train_series))
print('Validation set len:', len(val_series))
print('Test set len:', len(test_series))

Number of images: 6612
Train set len: 5289
Validation set len: 661
Test set len: 662


In [11]:
def export_split_as_jpg(patient_ids, split_name):
    """Convert the DICOM files of one split to JPG images.

    Reads ``<TRAIN_SRC_DIR>/<patient_id>.dcm`` for every id and saves its pixel
    array as ``<DATASET_DIR>/images/<split_name>/<patient_id>.jpg``, then prints
    how many files the target folder contains.

    Args:
        patient_ids: iterable of patient id strings.
        split_name: sub-folder of ``images/`` to write to ('train', 'val' or 'test').
    """
    out_dir = os.path.join(DATASET_DIR, 'images', split_name)
    for patient_id in tqdm(patient_ids):
        dcm_data = pydicom.dcmread(os.path.join(TRAIN_SRC_DIR, patient_id + '.dcm'))
        image = Image.fromarray(dcm_data.pixel_array)
        image.save(os.path.join(out_dir, patient_id + '.jpg'))
    print(f'Images moved to {split_name} folder:', len(os.listdir(out_dir)))


# All three subsets come from the labelled training images; only the target folder differs.
for split_name, patient_ids in (('train', train_series),
                                ('val', val_series),
                                ('test', test_series)):
    export_split_as_jpg(patient_ids, split_name)

100%|██████████| 5289/5289 [01:09<00:00, 76.02it/s]


Images moved to train folder: 5289


100%|██████████| 661/661 [00:08<00:00, 76.39it/s]


Images moved to val folder: 661


100%|██████████| 662/662 [00:08<00:00, 75.50it/s]

Images moved to test folder: 662





In [15]:
def translate_bbox(bbox, img_size=1024):
    """Convert an absolute top-left box into a normalized center-based box.

    Args:
        bbox: indexable ``(top_left_x, top_left_y, width, height)`` in pixels.
        img_size: side length in pixels used for normalization; defaults to 1024,
            the RSNA default image size.

    Returns:
        Tuple ``(center_x, center_y, width, height)``, each divided by ``img_size``.
    """
    top_left_x = bbox[0]
    top_left_y = bbox[1]
    absolute_w = bbox[2]
    absolute_h = bbox[3]

    relative_w = absolute_w / img_size
    relative_h = absolute_h / img_size

    # Shift from the top-left corner to the box center by half the box size.
    relative_x = top_left_x / img_size + relative_w / 2
    relative_y = top_left_y / img_size + relative_h / 2

    return relative_x, relative_y, relative_w, relative_h
    
def revert_bbox(rx, ry, rw, rh, img_size=1024):
    """Convert a normalized center-based box back to an absolute top-left box.

    Args:
        rx, ry: box center, as fractions of ``img_size``.
        rw, rh: box width and height, as fractions of ``img_size``.
        img_size: side length in pixels used for scaling; defaults to 1024,
            the RSNA default image size.

    Returns:
        Tuple ``(top_left_x, top_left_y, width, height)`` in pixels.
    """
    # Step back from the center by half the box size to reach the top-left corner.
    x = (rx - rw / 2) * img_size
    y = (ry - rh / 2) * img_size
    w = rw * img_size
    h = rh * img_size

    return x, y, w, h
    
    
def save_label(label_dir, patient_id, bbox):
    """Append one label line for ``patient_id`` to ``<label_dir>/<patient_id>.txt``.

    The file is opened in append mode, so calling this once per box collects all
    boxes of an image in a single file. If ``bbox`` contains a missing value the
    file is still created (or left untouched) but nothing is written, which
    leaves an empty label file for an image without boxes.

    Args:
        label_dir: existing directory that receives the ``.txt`` file.
        patient_id: image identifier, used as the file stem.
        bbox: sequence ``(x, y, width, height)`` handed to ``translate_bbox``.
    """
    label_fp = os.path.join(label_dir, patient_id + '.txt')

    with open(label_fp, "a") as f:
        # Comparing against the string 'nan' never matches a float NaN; pd.isna does.
        # The object-dtype cast makes the check work for lists and tuples as well.
        if pd.isna(np.asarray(bbox, dtype=object)).any():
            return

        x, y, w, h = translate_bbox(bbox)
        # Class id 0 is the only class written by this notebook.
        f.write(f"0 {x} {y} {w} {h}\n")

In [18]:
LABELS_DIR = "./labels_temp/"
# Start from an empty scratch directory: save_label() appends to existing files, so
# leftovers from an interrupted run would duplicate boxes (and os.mkdir would raise).
shutil.rmtree(LABELS_DIR, ignore_errors=True)
os.makedirs(LABELS_DIR)

# One label line per annotated box; rows without a box produce no label file.
for row in annotations.values:
    if pd.notna(row[1:5]).all():
        save_label(LABELS_DIR, row[0], row[1:5])

# Copy each label file into the folder of the subset its image belongs to.
for split_name, patient_ids in (('train', train_series),
                                ('val', val_series),
                                ('test', test_series)):
    # The trailing '' keeps the trailing separator, so the target is always
    # treated as a directory.
    split_label_dir = os.path.join(DATASET_DIR, 'labels', split_name, '')
    for patient_id in patient_ids:
        label_path = os.path.join(LABELS_DIR, patient_id + '.txt')
        if os.path.isfile(label_path):
            shutil.copy(label_path, split_label_dir)

shutil.rmtree(LABELS_DIR)

In [19]:
# Zip the whole dataset directory; the call returns the path of the created archive.
shutil.make_archive(base_name='runs_archive', format='zip', root_dir=DATASET_DIR)

'/kaggle/working/runs_archive.zip'