## Load packages and import tables

In [1]:
# This script prepares the training dataset (PNG slices) and the JSON annotation files.
import json
import os

import imageio
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import mmcv  # provides mmcv.imread, used by the annotation and visualization cells
import numpy as np
import pandas as pd
import pydicom as dicom
from PIL import Image, ImageOps

from duke_dbt_data import dcmread_image

In [None]:
# Locations of the input data and the dataset split to process.
basic_path = 'the path to save dicom images'  # root directory that holds the DICOM images
csv_path = './data_csv/'  # directory that holds the label and box tables
data_type = 'train'  # dataset split: 'train', 'val' or 'test'


def _read_split_table(prefix):
    """Read the CSV named `prefix + data_type + '.csv'` located under csv_path."""
    return pd.read_csv(csv_path + prefix + data_type + '.csv')


# File-path table: the columns used below are 'descriptive_path' and 'View'.
path_table = _read_split_table('BCS-DBT file-paths-')
path_list = path_table['descriptive_path']
view_list = path_table['View']
# Full path of every DICOM file: root + split + manifest folder + descriptive path.
target_list = basic_path + data_type + '/manifest-xxxxxxxx/' + path_list
# Bounding-box table and label table of the same split.
box_table = _read_split_table('BCS-DBT boxes-')
label_list = _read_split_table('BCS-DBT labels-')

## Read and save dicom images as png slices

In [None]:
# Empty table that will receive one row per exported slice image.
slice_table_columns = [
    'StudyUID',
    'view',
    'img_path',
    'Normal',
    'Actionable',
    'Benign',
    'Cancer',
]
df = pd.DataFrame(columns=slice_table_columns)

In [None]:
# Export PNG slices of every DICOM volume and record each exported slice in `df`.
# A volume without any row in box_table contributes its middle slice; a volume
# with boxes contributes the slice listed for each of its boxes.
save_path = './images/' + data_type + '/'
# imageio.imwrite does not create missing directories, so create the target first.
os.makedirs(save_path, exist_ok=True)
img_cnt = 0
try:
    for i, (study_uid, view, dicom_path) in enumerate(
            zip(path_table['StudyUID'], view_list, target_list)):
        # get the side of this image: the first letter of the view name
        side = view[0]
        # read dicom image
        img = dcmread_image(fp=dicom_path, view=view)
        # match the image in the box_table
        col1 = box_table['StudyUID'] == study_uid
        col2 = box_table['View'] == view
        col_final = box_table[col1 & col2]
        if len(col_final) == 0:
            # this image has no box: save only the middle slice
            slice_ids = [img.shape[0] // 2]
        else:
            # this image has boxes: save the slice recorded for each box
            slice_ids = [int(s) for s in col_final['Slice']]
        for slice_now in slice_ids:
            img_slice = img[slice_now]
            slice_name = study_uid + '_' + view + '_' + str(slice_now) + '.png'
            print(slice_name)
            if side == 'r':  # we flip all the breasts on the right side to the left
                img_slice = img_slice[:, ::-1]
            imageio.imwrite(save_path + slice_name, img_slice)
            # NOTE(review): labels are looked up by row position `i`, which assumes the
            # labels CSV has the same row order as the file-paths CSV -- confirm.
            df.loc[img_cnt] = [
                study_uid,
                view,
                save_path + slice_name,
                label_list['Normal'][i],
                label_list['Actionable'][i],
                label_list['Benign'][i],
                label_list['Cancer'][i],
            ]
            img_cnt += 1
finally:
    # Write the table once instead of after every volume; the `finally` keeps the
    # rows collected so far on disk even when the export stops part-way.
    df.to_csv('table_list_slice.csv', index=False, header=True)

## Make the detection JSON file used as ground truth

In [None]:
# Build the detection ground truth: one record per slice image labelled benign or
# cancer, holding the image size and its bounding boxes, written to a JSON file
# keyed by image path.
# NOTE(review): 'train_labels.csv' must provide the columns 'img_path', 'StudyUID',
# 'view', 'Benign' and 'Cancer' -- presumably the slice table exported earlier; confirm.
label_list = pd.read_csv('train_labels.csv')
col1 = label_list['Benign'] == 1
col2 = label_list['Cancer'] == 1
img_use = label_list[col1 | col2]  # we combine benign and cancer as the same type
dataset_dicts = []
save_dir = 'images/' + data_type
# The row label is named `row_id`, not `id`: at notebook top level a loop variable
# called `id` would replace the builtin id() for every later cell.
for image_id, row_id in enumerate(img_use.index):
    img_path = label_list['img_path'][row_id]
    image = mmcv.imread(img_path)  # loaded here only for its height and width
    # get the boxes of the same study and view
    same_study = box_table['StudyUID'] == label_list['StudyUID'][row_id]
    same_view = box_table['View'] == label_list['view'][row_id]
    # NOTE(review): boxes are not filtered by slice, so every slice image of a view
    # receives all boxes of that view -- confirm this is intended.
    col_final = box_table[same_study & same_view]
    ann = []
    for _, line in col_final.iterrows():
        x = line['X']
        y = line['Y']
        width = line['Width']
        height = line['Height']
        if line['View'][0] == 'r':
            # right-side views: mirror the box horizontally inside the image width
            x = image.shape[1] - 1 - x - width
        ann.append({
            'bbox': [int(x), int(y), int(x + width), int(y + height)],  # x0, y0, x1, y1
            "bbox_mode": 0,
            "segmentation": [],
            "category_id": 0,
        })
    dataset_dicts.append({
        "file_name": img_path,
        "image_id": image_id,
        "height": int(image.shape[0]),
        "width": int(image.shape[1]),
        "annotations": ann,
    })
# Index the records by file name; a later record with the same path replaces the earlier one.
new_dict = {record["file_name"]: record for record in dataset_dicts}
# NOTE(review): save_dir has no trailing separator, so for data_type 'train' this
# writes 'images/trainsample.json' -- confirm that file name is the intended one.
with open(save_dir + "sample.json", "w") as outfile:
    json.dump(new_dict, outfile)

## Visualize the slices with their bounding boxes

In [None]:
# Draw the bounding boxes on every benign/cancer slice image and save each figure.
# The box table does not change between images, so it is read once up front.
box_table = pd.read_csv('bboxes_new.csv')
for row_id in img_use.index:
    img_path = label_list['img_path'][row_id].replace('crop_val', 'val')
    image = mmcv.imread(img_path)
    fig, ax = plt.subplots()
    ax.imshow(image, cmap='Greys_r')
    # get the boxes of the same study and view
    same_study = box_table['StudyUID'] == label_list['StudyUID'][row_id]
    same_view = box_table['View'] == label_list['view'][row_id]
    col_final = box_table[same_study & same_view]
    for _, line in col_final.iterrows():
        x = line['X']
        y = line['Y']
        width = line['Width']
        height = line['Height']
        if line['View'][0] == 'r':
            # right-side views: mirror the box horizontally inside the image width
            x = image.shape[1] - 1 - x - width
        rect = patches.Rectangle((x, y), width, height, linewidth=1, edgecolor='r', facecolor='none')
        ax.add_patch(rect)
    # NOTE(review): if img_path contains no 'train', save_name equals img_path and the
    # figure overwrites the source slice -- confirm this cell is only run on the train split.
    save_name = img_path.replace('train', 'train&box')
    out_dir = os.path.dirname(save_name)
    if out_dir:
        # savefig does not create missing directories
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_name)
    # Release the figure; otherwise one open figure per image accumulates in memory.
    plt.close(fig)