# Data Preparation and Pre-processing

In [1]:
import pandas as pd
from duke_dbt_data import dcmread_image
import matplotlib.pyplot as plt
import imageio
import mmcv
import json
from matplotlib import patches
import os



In [13]:
# Locations and split selection used by the rest of the notebook.
basic_path = '/final-images/'  # root folder for the final images
csv_path = '/data/md311/Breast_Cancer_Detection_DBT/dataset/data_csv/'  # folder holding the label/box/path tables
data_type = 'train-v2'  # or val or test


def _strip_leading_dirs(classic_path):
    """Return what follows the third '/' of *classic_path*, or None if there are fewer than three."""
    parts = classic_path.split('/', 3)
    if len(parts) > 3:
        return parts[3]
    return None


# File-path table of the selected split and the columns the export loop reads.
path_table = pd.read_csv(f'{csv_path}BCS-DBT file-paths-{data_type}.csv')
path_list = path_table['descriptive_path']
# Relative DICOM paths (an earlier variant prefixed them with
# 'Breast-Cancer-Screening-DBT/'; that prefix is not applied here).
source_list = path_table['classic_path'].apply(_strip_leading_dirs)
view_list = path_table['View']

# Final DICOM image paths, one per row of path_table.
target_prefix = basic_path + data_type + '/manifest-xxxxxxxx/'
target_list = target_prefix + path_list

# Bounding-box and label tables of the same split.
box_table = pd.read_csv(f'{csv_path}BCS-DBT boxes-{data_type}.csv')
label_list = pd.read_csv(f'{csv_path}BCS-DBT labels-{data_type}.csv')

## Read and save DICOM images as PNG slices

In [11]:
# Empty accumulators for the export loop: `df` gets one row per exported
# slice, `df_not_found` one row per DICOM path that is missing on disk.
slice_columns = [
    'StudyUID',
    'view',
    'img_path',
    'Normal',
    'Actionable',
    'Benign',
    'Cancer',
]
df = pd.DataFrame(columns=slice_columns)
df_not_found = pd.DataFrame(columns=['Path'])

In [None]:
# Export PNG slices from every DICOM volume of the selected split.
#
# For each row of the path table:
#   * if the volume has no entry in box_table, its middle slice is exported;
#   * otherwise one slice is exported per box row (the box's 'Slice' value).
# Every exported slice is recorded in `df` together with the four label
# columns; volumes whose DICOM file is missing are recorded in `df_not_found`.
# Both tables are re-written to CSV as the loop progresses, so an interrupted
# run keeps what has been processed so far.
save_path = './images/' + data_type + '/'
# Make sure the output folder exists; writing into a missing directory fails.
os.makedirs(save_path, exist_ok=True)
dicom_root = '/data/md311/Breast-Cancer-Screening-DBT/' + data_type + '/'
label_columns = ['Normal', 'Actionable', 'Benign', 'Cancer']
img_cnt = 0
not_found_cnt = 0
for i in range(len(target_list)):
    view = view_list[i]
    study_uid = path_table['StudyUID'][i]
    # the first letter of the view name gives the side of this image
    side = view[0]

    # source_list holds None for paths that could not be shortened; report
    # those as "not found" instead of crashing on the string concatenation.
    rel_path = source_list[i]
    if pd.isna(rel_path):
        dicom_path = str(path_table['classic_path'][i])
        dicom_found = False
    else:
        dicom_path = dicom_root + rel_path
        dicom_found = os.path.exists(dicom_path)

    if not dicom_found:
        print("File not found:",dicom_path)
        df_not_found.loc[not_found_cnt] = [dicom_path]
        not_found_cnt += 1
        df_not_found.to_csv(csv_path+'not_found.csv',index=False, header=True)
        continue

    # read the DICOM image
    img = dcmread_image(fp=dicom_path, view=view)

    # box rows belonging to this study and view
    same_study = box_table['StudyUID'] == study_uid
    same_view = box_table['View'] == view
    col_final = box_table[same_study & same_view]

    if len(col_final) == 0:
        # this image has no box: save the middle slice
        slice_indices = [img.shape[0] // 2]
    else:
        # this image has boxes: save the slice recorded for each box
        slice_indices = [int(s) for s in col_final['Slice']]

    # NOTE(review): labels are looked up by row position, which assumes
    # label_list is row-aligned with path_table -- confirm.
    labels = [label_list[column][i] for column in label_columns]

    for slice_now in slice_indices:
        img_slice = img[slice_now]
        slice_name = study_uid + '_' + view + '_' + str(slice_now) + '.png'
        # only write the image if it is not already in the target folder
        if not os.path.exists(save_path + slice_name):
            print(i,slice_name,"...",sep='')
            if side == 'r':  # flip right-side images horizontally so they match the left side
                img_slice = img_slice[:,::-1]
            imageio.imwrite(save_path + slice_name,img_slice)
        else:
            print(i,slice_name)
        df.loc[img_cnt] = [study_uid, view, save_path + slice_name] + labels
        img_cnt += 1

    # checkpoint the slice table after every processed volume
    df.to_csv(csv_path+data_type+'_table_list_slice.csv',index = False, header=True)

## Load and preview the generated CSV

In [24]:
# Reload the slice table saved for the train-v2 split and show its first rows.
slice_table_file = csv_path + 'train-v2_table_list_slice.csv'
df = pd.read_csv(slice_table_file)
df.iloc[:11]

Unnamed: 0,StudyUID,view,img_path,Normal,Actionable,Benign,Cancer
0,DBT-S00163,rmlo,./images/train-v2/DBT-S00163_rmlo_16.png,0,0,1,0
1,DBT-S04378,lcc,./images/train-v2/DBT-S04378_lcc_31.png,0,1,0,0
2,DBT-S04378,lmlo,./images/train-v2/DBT-S04378_lmlo_33.png,0,1,0,0
3,DBT-S04378,rcc,./images/train-v2/DBT-S04378_rcc_29.png,0,1,0,0
4,DBT-S04378,rmlo,./images/train-v2/DBT-S04378_rmlo_31.png,0,1,0,0
5,DBT-S03255,lcc,./images/train-v2/DBT-S03255_lcc_19.png,0,0,1,0
6,DBT-S03255,lcc,./images/train-v2/DBT-S03255_lcc_37.png,0,0,1,0
7,DBT-S03255,lmlo,./images/train-v2/DBT-S03255_lmlo_11.png,0,0,1,0
8,DBT-S03255,lmlo,./images/train-v2/DBT-S03255_lmlo_12.png,0,0,1,0
9,DBT-S00044,lcc,./images/train-v2/DBT-S00044_lcc_35.png,1,0,0,0
