# Data Prep

In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import pydicom
from sklearn.model_selection import GroupKFold
# Do not truncate cell contents in DataFrame output, so long values such as image paths display in full.
pd.set_option('display.max_colwidth',None)

In [2]:
# Image folders. The trailing slash is significant: file names are appended to these
# strings by plain concatenation elsewhere in the notebook.
img_train_path, img_val_path = './data/train/', './data/val/'

# Annotation tables: the input CSV and the processed CSV written at the end of the notebook.
train_csv_path, new_csv_path = './data/train.csv', './data/new_train.csv'

# Label output folders.
lbl_train_path, lbl_val_path = './data/labels/train', './data/labels/val'
label_dir = './data/yolo-labels'

# Create the output folders up front; img_train_path is not created here and must already exist.
for _dir in (lbl_train_path, lbl_val_path, img_val_path):
    os.makedirs(_dir, exist_ok = True)

In [3]:
# Reset: undo a previous train/val split by moving every entry of the validation
# image folder back into the training image folder, so the split below starts clean.
# (To inspect the previous split instead, load new_csv_path, drop duplicate image_ids
# and filter on fold == 4.)
files = os.listdir(img_val_path)
for file in files:
    # Move to an explicit destination path. With only a directory as destination,
    # shutil.move raises shutil.Error if an entry of the same name already exists
    # there, which would abort the reset half-way through.
    shutil.move(os.path.join(img_val_path, file), os.path.join(img_train_path, file))

In [4]:
# Load the annotation table and preview it. In the preview, "No finding" rows
# (class_id 14) carry no bounding-box coordinates.
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,


In [5]:
# Number of distinct images referenced by the annotation table.
train_df['image_id'].nunique(dropna = False)

15000

# Keep only the 14 abnormality classes (drop "No finding", class_id 14)

In [6]:
# class_id 14 is the "No finding" label, which has no bounding box; keep every other row.
has_finding = train_df['class_id'] != 14
train_df = train_df.loc[has_finding].reset_index(drop = True)

In [7]:
# Number of distinct images that still have at least one annotation after the filter.
train_df['image_id'].nunique(dropna = False)

4394

# Split

In [8]:
# Index of the fold reserved for validation (set_file_paths moves fold-4 images to the val folder).
fold = 4
gkf = GroupKFold(n_splits = 5)

# One row per image, sorted so the positional indices returned by the splitter
# line up with the frame's 0..n-1 index.
images = train_df.drop_duplicates('image_id')[['image_id']].sort_values('image_id').reset_index(drop = True)
images['fold'] = -1

# Use a dedicated loop variable: reusing `fold` here would silently overwrite the
# validation-fold setting above with the index of the last split.
for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(images, groups = images.image_id.tolist())):
    images.loc[val_idx, 'fold'] = fold_idx

# Every image must have been placed in exactly one validation fold.
assert (images['fold'] >= 0).all(), 'some images were not assigned to a fold'
images.head()

Unnamed: 0,image_id,fold
0,0005e8e3701dfb1dd93d53e2ff537b6e,3
1,0007d316f756b3fa0baea2ff514ce945,0
2,000d68e42b71d3eac10ccc077aba07c1,4
3,00150343289f317a0ad5629d5b7d9ef9,3
4,001d127bad87592efe45a5c7678f8b8d,2


In [9]:
# Number of images per fold.
images['fold'].value_counts()

3    879
2    879
1    879
0    879
4    878
Name: fold, dtype: int64

In [10]:
# image_ids whose DICOM file could not be found in either image folder.
missing = []
def set_file_paths(row, train_dir = None, val_dir = None, val_fold = 4, missing_log = None):
    """Place one image in the folder matching its fold and return its path.

    Images whose fold equals ``val_fold`` are moved into the validation folder;
    all others belong in the training folder. The function is safe to re-run:
    a file that already sits in the correct folder is left where it is, and a
    training image left behind in the validation folder is moved back.

    Parameters
    ----------
    row : mapping/Series providing ``image_id`` (attribute) and ``'fold'`` (key).
    train_dir, val_dir : str, optional
        Image folders. Default to the notebook-level ``img_train_path`` and
        ``img_val_path``.
    val_fold : int, optional
        Fold number used for validation (default 4).
    missing_log : list, optional
        List that receives the ids of images found in neither folder.
        Defaults to the notebook-level ``missing`` list.

    Returns
    -------
    str
        Path of the image after placement, or the sentinel ``"MISSING"`` when
        the file exists in neither folder.
    """
    if train_dir is None:
        train_dir = img_train_path
    if val_dir is None:
        val_dir = img_val_path
    if missing_log is None:
        missing_log = missing

    file_name = row.image_id + ('.dicom')
    train_file = os.path.join(train_dir, file_name)
    val_file = os.path.join(val_dir, file_name)
    in_train = os.path.exists(train_file)
    in_val = os.path.exists(val_file)

    # Only report an image as missing when it is in neither folder; a file that
    # a previous run already moved to the validation folder is not missing.
    if not (in_train or in_val):
        print(f'MISSING: {row.image_id}.dicom')
        missing_log.append(row.image_id)
        return "MISSING"

    if row['fold'] == val_fold:
        if in_train:
            # Explicit destination file path so a stale copy does not make
            # shutil.move fail with "destination path already exists".
            shutil.move(train_file, val_file)
        return val_file

    if not in_train:
        # Training image left in the validation folder by an earlier split.
        shutil.move(val_file, train_file)
    return train_file

In [11]:
# Resolve the on-disk path of every image. NOTE: this cell has a side effect on the
# file system: set_file_paths moves fold-4 images into the validation folder, and
# images whose file cannot be found get the path "MISSING" (each is also printed below).
images['image_path'] = images.apply(set_file_paths, axis = 1)
images

MISSING: 31901b2d1d2eb92c235f47f498b907c0.dicom
MISSING: 347180362348e522905047dde655b6d7.dicom
MISSING: 3c98c90840a9290511ae8192107f8250.dicom
MISSING: 4016b176eaf40d514b0559a0aeb48f21.dicom
MISSING: 415f0f58066bad3d69bd5fac2c80574b.dicom
MISSING: 42c049fd05428f7d606e9da4a95a8c3b.dicom
MISSING: 47ed17dcb2cbeec15182ed335a8b5a9e.dicom
MISSING: 4d01d09027d1d1e0513de4c8b4fc20e1.dicom
MISSING: 6833c509c10b569d6167db70b83ee5d5.dicom
MISSING: 6ce61a39f1e1bff629566de047ab8775.dicom
MISSING: 7043451da60cb89537435829965759c3.dicom
MISSING: 7653a1c4431f1929ae8c73588e39b8d5.dicom
MISSING: 7c6f191b5d28bc1992e491d906f0d1a5.dicom
MISSING: 9246c1061217b8131ccdaea80327e24d.dicom
MISSING: 98d44861c84d532bcca874fcde5e5f42.dicom
MISSING: 999612f847684578b1fdcf2d9d4d4994.dicom
MISSING: a3f5ac68c8d1b1805be21f18c47fc186.dicom
MISSING: b3cc453518cf826875edcd3d5778ee87.dicom
MISSING: b9f09915187c2cc01e958da90d97ac89.dicom
MISSING: ba46dcb445340df33566b52d7192ab6e.dicom
MISSING: bc2be005526db7ab9d5ec6741ddee94

Unnamed: 0,image_id,fold,image_path
0,0005e8e3701dfb1dd93d53e2ff537b6e,3,./data/train/0005e8e3701dfb1dd93d53e2ff537b6e.dicom
1,0007d316f756b3fa0baea2ff514ce945,0,./data/train/0007d316f756b3fa0baea2ff514ce945.dicom
2,000d68e42b71d3eac10ccc077aba07c1,4,./data/val/000d68e42b71d3eac10ccc077aba07c1.dicom
3,00150343289f317a0ad5629d5b7d9ef9,3,./data/train/00150343289f317a0ad5629d5b7d9ef9.dicom
4,001d127bad87592efe45a5c7678f8b8d,2,./data/train/001d127bad87592efe45a5c7678f8b8d.dicom
...,...,...,...
4389,ff924bcbd38f123aec723aa7040d7e43,0,./data/train/ff924bcbd38f123aec723aa7040d7e43.dicom
4390,ffb5d0b005261ed350f7a08c06613a34,4,./data/val/ffb5d0b005261ed350f7a08c06613a34.dicom
4391,ffceb71a80efba3b83c88e11f4b9694b,3,./data/train/ffceb71a80efba3b83c88e11f4b9694b.dicom
4392,ffe6f9fe648a7ec29a50feb92d6c15a4,4,./data/val/ffe6f9fe648a7ec29a50feb92d6c15a4.dicom


In [12]:
def dim_from_dicom(path):
    """Return the (width, height) of a DICOM image as (Columns, Rows).

    Only the header is read (``stop_before_pixels=True``): the pixel data is
    not needed for the dimensions, and skipping it avoids loading every full
    image into memory.

    Parameters
    ----------
    path : str
        Path of the DICOM file.

    Returns
    -------
    tuple
        ``(Columns, Rows)``; an element is ``None`` when the tag is absent.
    """
    ds = pydicom.dcmread(path, stop_before_pixels = True)
    return ds.get('Columns'), ds.get('Rows')

In [13]:
# Drop images whose file was not found. The explicit .copy() makes `images` an
# independent frame, so adding columns below does not trigger pandas'
# SettingWithCopyWarning.
images = images[images.image_path != 'MISSING'].copy()

# Read (width, height) once per image. Building the columns from a list also works
# when no image is left, whereas unpacking zip(*...) fails on an empty frame.
dims = [dim_from_dicom(path) for path in images['image_path']]
images['width'] = [width for width, _ in dims]
images['height'] = [height for _, height in dims]

images.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,image_id,fold,image_path,width,height
0,0005e8e3701dfb1dd93d53e2ff537b6e,3,./data/train/0005e8e3701dfb1dd93d53e2ff537b6e.dicom,3072,3072
1,0007d316f756b3fa0baea2ff514ce945,0,./data/train/0007d316f756b3fa0baea2ff514ce945.dicom,2304,2880
2,000d68e42b71d3eac10ccc077aba07c1,4,./data/val/000d68e42b71d3eac10ccc077aba07c1.dicom,2304,2880
3,00150343289f317a0ad5629d5b7d9ef9,3,./data/train/00150343289f317a0ad5629d5b7d9ef9.dicom,2525,2508
4,001d127bad87592efe45a5c7678f8b8d,2,./data/train/001d127bad87592efe45a5c7678f8b8d.dicom,3072,3072


In [14]:
# Attach fold, width and height to every annotation row. Annotations whose image
# is not present in `images` get NaN in those columns because of the left join.
image_meta = images.drop(columns = 'image_path')
train_df = pd.merge(train_df, image_meta, how = 'left', on = 'image_id')
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold,width,height
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,4.0,2080.0,2336.0
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,0.0,2304.0,2880.0
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,4.0,2540.0,3072.0
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,4.0,2285.0,2555.0
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,,,


In [15]:
# Discard every annotation row that contains a NaN in any column, e.g. rows whose
# image had no match in the merge above.
train_df = train_df.dropna(axis = 0, how = 'any')
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold,width,height
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,4.0,2080.0,2336.0
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,0.0,2304.0,2880.0
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,4.0,2540.0,3072.0
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,4.0,2285.0,2555.0
5,d3637a1935a905b3c326af31389cb846,Aortic enlargement,0,R10,1329.0,743.0,1521.0,958.0,3.0,2304.0,2880.0


In [16]:
# Convert pixel boxes to coordinates relative to the image size, then derive the
# box centre, size and area. Plain column arithmetic gives the same values as a
# row-wise apply but is far faster and also works on an empty frame.
#
# NOTE: this cell overwrites x_min/y_min/x_max/y_max in place, so it must run exactly
# once per load of train_df; running it again would divide by width/height twice.
train_df['x_min'] = train_df['x_min'] / train_df['width']
train_df['y_min'] = train_df['y_min'] / train_df['height']

train_df['x_max'] = train_df['x_max'] / train_df['width']
train_df['y_max'] = train_df['y_max'] / train_df['height']

train_df['x_mid'] = (train_df['x_max'] + train_df['x_min']) / 2
train_df['y_mid'] = (train_df['y_max'] + train_df['y_min']) / 2

train_df['w'] = train_df['x_max'] - train_df['x_min']
train_df['h'] = train_df['y_max'] - train_df['y_min']

train_df['area'] = train_df['w'] * train_df['h']
train_df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold,width,height,x_mid,y_mid,w,h,area
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,0.332212,0.588613,0.794712,0.783818,4.0,2080.0,2336.0,0.563462,0.686216,0.4625,0.195205,0.090283
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,0.548611,0.257986,0.699219,0.353819,0.0,2304.0,2880.0,0.623915,0.305903,0.150608,0.095833,0.014433
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,0.24685,0.116211,0.372835,0.140951,4.0,2540.0,3072.0,0.309843,0.128581,0.125984,0.02474,0.003117
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,0.589497,0.09589,0.957549,0.848924,4.0,2285.0,2555.0,0.773523,0.472407,0.368053,0.753033,0.277156
5,d3637a1935a905b3c326af31389cb846,Aortic enlargement,0,R10,0.576823,0.257986,0.660156,0.332639,3.0,2304.0,2880.0,0.61849,0.295312,0.083333,0.074653,0.006221


# Save to CSV

In [17]:
# Write the processed annotations to new_csv_path with image_id as the index column.
# TODO(review): an "only write if the file does not exist yet" guard appears to have
# been started here and never finished; as written the CSV is written on every run.
train_df.set_index('image_id').to_csv(new_csv_path)