## Load packages and import tables

In [1]:
# this script is for preparing the training dataset and json files.
import pandas as pd
from duke_dbt_data import dcmread_image
import pydicom as dicom
import matplotlib.pyplot as plt
import numpy as np
import imageio
from PIL import Image, ImageOps
import mmcv
import json
from matplotlib import patches
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read data path
basic_path =  '/final-images/' # the path to save the images
csv_path ='./data_csv/' # the path to save the labels and tables
data_type = 'train-v2' # or val or test
path_table = pd.DataFrame(pd.read_csv(csv_path +'BCS-DBT file-paths-'+ data_type + '.csv'))
path_list = path_table['descriptive_path']
source_list = path_table['classic_path'].apply(lambda x: x.split('/',3)[3] if len(x.split('/')) > 3 else None)
# source_list = 'Breast-Cancer-Screening-DBT/' + source_list
view_list = path_table['View']
target_list = basic_path + data_type + '/manifest-xxxxxxxx/' + path_list # the list of your final dicom images path
box_table = pd.DataFrame(pd.read_csv(csv_path +'BCS-DBT boxes-'+ data_type + '.csv'))
label_list = pd.DataFrame(pd.read_csv(csv_path +'BCS-DBT labels-'+ data_type +'.csv'))

## Read and save dicom images as png slices

In [11]:
df = pd.DataFrame(columns = ['StudyUID', 'view','img_path', 'Normal','Actionable','Benign' ,'Cancer'])
df_not_found = pd.DataFrame(columns = ['Path'])

In [12]:
save_path = './images/' + data_type + '/'
img_cnt = 0
not_found_cnt = 0
for i in range(len(target_list)):
    # get the side of this image, the first letter
    side = view_list[i][0]
    # check if dicom image exists
    dicom_path = '/data/md311/Breast-Cancer-Screening-DBT/'+data_type+'/'+source_list[i]
    if os.path.exists(dicom_path):
        #read dicom image
        img = dcmread_image(fp=dicom_path, view=view_list[i])
        # match the image in the box_table
        col1 = box_table['StudyUID'] == path_table['StudyUID'][i]
        col2 = box_table['View'] == view_list[i]
        col_final = box_table[col1&col2]
        if len(col_final)==0: # this image don't have nodule, save the mid slice
            # this image don't have box
            center_slice = int(img.shape[0]/2)
            img_slice = img[center_slice]
            slice_name = path_table['StudyUID'][i] + '_' + view_list[i] + '_' + str(center_slice) + '.png'
            #check if image already exists in target folder
            if not os.path.exists(save_path + slice_name):
                print(i,slice_name,"...",sep='')
                if side == 'r':
                    img_slice = img_slice[:,::-1]
                imageio.imwrite(save_path + slice_name,img_slice) 
            else:
                print(i,slice_name)
            df.loc[img_cnt] = [path_table['StudyUID'][i], view_list[i],save_path + slice_name, label_list['Normal'][i], label_list['Actionable'][i], label_list['Benign'][i], label_list['Cancer'][i]]
            img_cnt += 1
        else: # this image has nodules, choose the center slice which contains nodule
            for j in range(len(col_final)):
                slice_now = int(col_final['Slice'].iloc[j])
                img_slice = img[slice_now]
                slice_name = path_table['StudyUID'][i] + '_' + view_list[i] + '_' + str(slice_now) + '.png'
                #check if image already exists in target folder
                if not os.path.exists(save_path + slice_name):
                    print(i,slice_name,"...",sep='')
                    if side == 'r': # we flip all the breast in right side to left
                        img_slice = img_slice[:,::-1]
                    imageio.imwrite(save_path + slice_name,img_slice) 
                else:
                    print(i,slice_name)
                df.loc[img_cnt] = [path_table['StudyUID'][i], view_list[i],save_path + slice_name, label_list['Normal'][i], label_list['Actionable'][i], label_list['Benign'][i], label_list['Cancer'][i]]
                img_cnt += 1
        df.to_csv('table_list_slice.csv',index = False, header=True)
    else:
        print("File not found:",dicom_path)
        df_not_found.loc[not_found_cnt] = [dicom_path]
        not_found_cnt += 1
        df_not_found.to_csv('not_found.csv',index=False, header=True)

File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.97979602815077649368346148322369132081/1-1.dcm
File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.12136582480949936067549454937434114072/1-1.dcm
File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.90045035130681803298603121403642951654/1-1.dcm
File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.10822555886306795549982251172752709387/1-1.dcm
File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.29938515490857039234478950570192161579/1-1.dcm
File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.81499742653858333874375993299750991029/1-1.dcm
File not found: /data/md311/bBreast-Cancer-Screening-DBT/train-v2/1.2.826.0.1.3680043.8.498.39826337616810443783818414516357453289/1-1.dcm
File not found: /data/md311

In [6]:
label_list.to_csv("breast")

',StudyUID,view,img_path,Normal,Actionable,Benign,Cancer\n0,DBT-S00163,rmlo,./images/train-v2/DBT-S00163_rmlo_16.png,0,0,1,0\n1,DBT-S04378,lcc,./images/train-v2/DBT-S04378_lcc_31.png,0,1,0,0\n2,DBT-S04378,lmlo,./images/train-v2/DBT-S04378_lmlo_33.png,0,1,0,0\n3,DBT-S04378,rcc,./images/train-v2/DBT-S04378_rcc_29.png,0,1,0,0\n4,DBT-S04378,rmlo,./images/train-v2/DBT-S04378_rmlo_31.png,0,1,0,0\n5,DBT-S03255,lcc,./images/train-v2/DBT-S03255_lcc_19.png,0,0,1,0\n6,DBT-S03255,lcc,./images/train-v2/DBT-S03255_lcc_37.png,0,0,1,0\n7,DBT-S03255,lmlo,./images/train-v2/DBT-S03255_lmlo_11.png,0,0,1,0\n8,DBT-S03255,lmlo,./images/train-v2/DBT-S03255_lmlo_12.png,0,0,1,0\n9,DBT-S00044,lcc,./images/train-v2/DBT-S00044_lcc_35.png,1,0,0,0\n10,DBT-S00044,lmlo,./images/train-v2/DBT-S00044_lmlo_34.png,1,0,0,0\n11,DBT-S00044,rcc,./images/train-v2/DBT-S00044_rcc_32.png,1,0,0,0\n12,DBT-S00044,rmlo,./images/train-v2/DBT-S00044_rmlo_32.png,1,0,0,0\n13,DBT-S04706,lcc,./images/train-v2/DBT-S04706_lcc_34.png,1,0,0,0\n

## make detection json file as ground truth

In [3]:
labels_path = 'table_list_slice.csv'#'./data_csv/BCS-DBT labels-train-v2.csv'
label_list = pd.read_csv(labels_path)
col1 = label_list['Benign']==1
col2 = label_list['Cancer']==1
img_use = label_list[col1|col2] # we combine benign and cancer as the same type
dataset_dicts = []
save_dir = 'images/' + data_type
i = 0 
for id in img_use.index:
    print("id",id)
    record = {}
    ann = []
    img_path = label_list['img_path'][id]
    image = mmcv.imread(img_path)
    # get box
    col1 = box_table['StudyUID'] ==  label_list['StudyUID'][id]
    col2 = box_table['View'] ==  label_list['view'][id]
    #col3 = box_table['Slice'] == slice
    col_final = box_table[col1&col2]
    record["file_name"] = img_path
    record["image_id"] = int(i)
    record["height"] = int(image.shape[0])
    record["width"] = int(image.shape[1])
    for index,line in col_final.iterrows():
        x = line['X']
        y = line['Y']
        Width = line['Width']
        Height = line['Height']
        side = line['View'][0]
        if side == 'r':
            x = image.shape[1]-1-x-Width
        box = [int(x),int(y),int(x+Width), int(y+Height)]
        obj = {'bbox':box, 
                   "bbox_mode": 0,
                   "segmentation":[],
                   "category_id": 0,}
        ann.append(obj)
    i +=1
    record["annotations"] = ann
    dataset_dicts.append(record)
new_dict = {}
for i in dataset_dicts:
    new_dict[i["file_name"]] = i
with open(save_dir+"sample.json", "w") as outfile: 
    json.dump(new_dict, outfile)

id 0
id 5
id 6
id 7
id 8
id 25
id 34
id 35
id 104
id 117
id 118
id 147
id 148
id 149
id 150
id 151
id 152
id 169
id 170
id 251
id 252
id 253
id 254
id 255
id 256
id 265
id 266
id 295
id 308
id 309
id 350
id 351
id 352
id 369
id 370
id 375
id 376
id 541
id 542
id 543
id 544
id 545
id 546
id 547
id 548
id 549
id 558
id 559
id 604
id 605
id 618
id 619
id 620
id 621
id 626
id 627
id 640
id 641
id 642
id 663
id 668
id 669
id 686
id 687
id 692
id 693
id 710
id 711
id 736
id 737
id 742
id 743
id 744
id 745
id 766
id 767
id 788
id 789
id 794
id 795
id 820
id 821
id 822
id 823
id 824
id 857
id 858
id 867
id 868
id 873
id 874
id 875
id 876
id 993
id 994
id 995
id 996
id 1001
id 1014
id 1015
id 1036
id 1037
id 1038
id 1039
id 1044
id 1045
id 1086
id 1087
id 1160
id 1161
id 1206
id 1207
id 1220
id 1221
id 1222
id 1231
id 1232
id 1241
id 1242
id 1243
id 1244
id 1273
id 1274
id 1275
id 1276
id 1277
id 1278
id 1331
id 1332
id 1333
id 1334
id 1335
id 1336
id 1357
id 1358
id 1359
id 1380
id 1381
id 144

In [4]:
img_use

Unnamed: 0,StudyUID,view,img_path,Normal,Actionable,Benign,Cancer
0,DBT-S00163,rmlo,./images/train-v2/DBT-S00163_rmlo_16.png,0,0,1,0
5,DBT-S03255,lcc,./images/train-v2/DBT-S03255_lcc_19.png,0,0,1,0
6,DBT-S03255,lcc,./images/train-v2/DBT-S03255_lcc_37.png,0,0,1,0
7,DBT-S03255,lmlo,./images/train-v2/DBT-S03255_lmlo_11.png,0,0,1,0
8,DBT-S03255,lmlo,./images/train-v2/DBT-S03255_lmlo_12.png,0,0,1,0
...,...,...,...,...,...,...,...
2607,DBT-S05569,rmlo,./images/train-v2/DBT-S05569_rmlo_24.png,0,0,0,1
2608,DBT-S05588,rcc,./images/train-v2/DBT-S05588_rcc_41.png,0,0,1,0
2609,DBT-S05588,rmlo,./images/train-v2/DBT-S05588_rmlo_41.png,0,0,1,0
2618,DBT-S01839,rcc,./images/train-v2/DBT-S01839_rcc_44.png,0,0,0,1


## visualize the slices with bounding boxes

In [None]:
# draw with bounding box
dataset_dicts = []
save_dir = 'images/' + data_type
i = 0 
for id in img_use.index:
    record = {}
    ann = []
    img_path = label_list['img_path'][id].replace('crop_val','val')
    image = mmcv.imread(img_path)
    fig, ax = plt.subplots()
    ax.imshow(image,cmap='Greys_r')
    # get box
    box_table = pd.read_csv('./data_csv/BCS-DBT boxes-train-v2.csv')
    col1 = box_table['StudyUID'] ==  label_list['StudyUID'][id]
    col2 = box_table['View'] ==  label_list['view'][id]
    #col3 = box_table['Slice'] == slice
    col_final = box_table[col1&col2]
    for index,line in col_final.iterrows():
        x = line['X']
        y = line['Y']
        Width = line['Width']
        Height = line['Height']
        side = line['View'][0]
        if side == 'r':
            x = image.shape[1]-1-x-Width
        box = [int(x),int(y),int(x+Width), int(y+Height)]
        rect = patches.Rectangle((x, y), Width,Height, linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
    save_name = img_path.replace('train','train&box')
    fig.savefig(save_name)