# Phase 1: Basic Data Preprocessing

## Libraries and Data Loading

### Import Libraries

In [20]:
# Import libraries
import pandas as pd
import os

### Load Data
The CBIS-DDSM dataset contains: 
- A metadata file in CSV format
- Four description files in CSV format
- One directory of images, stored in subdirectories whose paths are based on the description of each case.

In [243]:
# Read the image-series metadata index
meta_path = 'CBIS-DDSM_Data/images_dataset/metadata.csv'
metadata_df = pd.read_csv(meta_path)

# Read the four case-description tables (mass/calc x train/test).
# The unpacking order matches the order of the paths listed below.
mass_train_orig, mass_test_orig, calc_train_orig, calc_test_orig = (
    pd.read_csv(csv_path)
    for csv_path in (
        "CBIS-DDSM_Data/mass_case_description_train_set.csv",
        "CBIS-DDSM_Data/mass_case_description_test_set.csv",
        "CBIS-DDSM_Data/calc_case_description_train_set.csv",
        "CBIS-DDSM_Data/calc_case_description_test_set.csv",
    )
)

Example of an image's path    

In [244]:
calc_test_orig["image file path"][0]

'Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009/1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992/000000.dcm'

## Extract Metadata and Description Data

### Prepare Description Data
#### Match and Combine Description Data
Currently, the mass and calcification description files use different column names. The column names are unified so that the files can be combined.

In [245]:
# Unified column names, followed by the matching original names in the mass and
# calcification files. The three lists are aligned position by position.
col_new_names = [
    "breast_density",
    "breast_side",
    "image_view",
    "abnormality_type",
    "pathology",
    "image_path",
    "cropped_image_path",
    "ROI_mask_path",
]
# The first mass name already equals the unified name, so renaming it is a no-op
mass_names = [
    "breast_density",
    "left or right breast",
    "image view",
    "abnormality type",
    "pathology",
    "image file path",
    "cropped image file path",
    "ROI mask file path",
]
# The calcification list differs from the mass list only in its first entry
calc_names = [
    "breast density",
    "left or right breast",
    "image view",
    "abnormality type",
    "pathology",
    "image file path",
    "cropped image file path",
    "ROI mask file path",
]

In [246]:
# Rename columns: build each old-name -> new-name mapping once and reuse it
# for both the training and the testing table of the same abnormality type
mass_rename_map = dict(zip(mass_names, col_new_names))
calc_rename_map = dict(zip(calc_names, col_new_names))

mass_train_renamed = mass_train_orig.rename(columns=mass_rename_map)
mass_test_renamed = mass_test_orig.rename(columns=mass_rename_map)
calc_train_renamed = calc_train_orig.rename(columns=calc_rename_map)
calc_test_renamed = calc_test_orig.rename(columns=calc_rename_map)

In [247]:
# Print the number of cases in each description file.
# Each label names the split that is actually counted on that line (the test
# tables were previously reported as "trained cases").
print("Original Number of Cases:")
print(" - Mass train cases: ", len(mass_train_renamed))
print(" - Mass test cases: ", len(mass_test_renamed))
print(" - Calc train cases: ", len(calc_train_renamed))
print(" - Calc test cases: ", len(calc_test_renamed))

Original Number of Cases:
 - Mass trained cases:  1318
 - Mass trained cases:  378
 - Calc trained cases:  1546
 - Calc trained cases:  326


In [248]:
# Keep only the shared columns and stack mass + calcification cases per split,
# then report the size of each combined set
train_orig = pd.concat(
    [frame[col_new_names] for frame in (mass_train_renamed, calc_train_renamed)],
    ignore_index=True,
)
test_orig = pd.concat(
    [frame[col_new_names] for frame in (mass_test_renamed, calc_test_renamed)],
    ignore_index=True,
)

print("New Number of Cases per Set:")
print(" - Train cases: ", len(train_orig))
print(" - Test cases: ", len(test_orig))
print("\n", train_orig.columns)

New Number of Cases per Set:
 - Train cases:  2864
 - Test cases:  704

 Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path'],
      dtype='object')


In [249]:
# Tag every row with the set it came from; the scalar is broadcast to all rows
train_orig["split"] = "train"
test_orig["split"] = "test"

In [250]:
print(train_orig.columns)

Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path',
       'split'],
      dtype='object')


In [251]:
test_orig.columns == train_orig.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True])

#### Extract Series UIDs from Description Paths 
The decompressed images are stored in a single directory, where each file name is the series UID generated using the metadata file. To connect each description with its corresponding, already decompressed image, the series UIDs are extracted from the paths in the description files and matched with the images stored in images_png.

In [252]:
# Function for extracting description series UIDs from paths stored in the description files 
def get_series_from_descriptions(im_types, new_cols, train, test, new_train, new_test):
    """Add series-UID columns derived from the path columns of the description data.

    For every (path column, new column) pair taken from ``im_types`` and
    ``new_cols``, the third "/"-separated component of each path in ``train`` /
    ``test`` is stored under the new column name in ``new_train`` / ``new_test``.

    Both output frames are modified in place and also returned as a tuple
    ``(new_train, new_test)``.
    """
    def third_path_part(paths):
        # Index 2 of the "/"-split path is the component used as the series UID
        return [path.split("/")[2] for path in paths]

    for path_col, uid_col in zip(im_types, new_cols):
        train_uids = third_path_part(train[path_col])
        test_uids = third_path_part(test[path_col])
        new_train[uid_col] = train_uids
        new_test[uid_col] = test_uids
    return new_train, new_test

In [253]:
# Extract description series UIDs from paths stored in the description files.
# The output frames start as copies of the combined description tables, so this
# cell runs on a fresh kernel (train_desc / test_desc do not exist before this
# cell) and train_orig / test_orig are left unmodified.
train_desc, test_desc = get_series_from_descriptions(
    ["image_path", "cropped_image_path", "ROI_mask_path"],
    ["series_uid_full", "series_uid_crop", "series_uid_roi"],
    train_orig, test_orig, train_orig.copy(), test_orig.copy())

In [254]:
# Show the first and the last training case, including the new series_uid columns
print("Two examples of description data cases with series_uid included: \n")
print(train_desc.iloc[0], "\n")
print(train_desc.iloc[-1])

Two examples of description data cases with series_uid included: 

breast_density                                                        3
breast_side                                                        LEFT
image_view                                                           CC
abnormality_type                                                   mass
pathology                                                     MALIGNANT
image_path            Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...
cropped_image_path    Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
ROI_mask_path         Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
split                                                             train
series_uid_full       1.3.6.1.4.1.9590.100.1.2.342386194811267636608...
series_uid_crop       1.3.6.1.4.1.9590.100.1.2.296736403313792599626...
series_uid_roi        1.3.6.1.4.1.9590.100.1.2.296736403313792599626...
Name: 0, dtype: object 

breast_density                              

### Extract Data
#### Images Data
Get the new locations of the images and create a new data frame with information about each image. For each saved image: 
- extract its series_uid
- find its corresponding information in the description (train/test) sets
- find its corresponding metadata. 

In [255]:
# Extracts paths from images stored in images_png directory
path_to_imgs = "CBIS-DDSM_Clean_Data/labeled_images_png/"
# os.listdir returns entries in arbitrary order, and later cells index rows by
# position, so the listing is sorted to make row positions reproducible.
# Only ".png" entries are kept so that stray files in the directory cannot
# break the "<series_uid>_<type>.png" name parsing done further down.
images_paths = sorted(
    name for name in os.listdir(path_to_imgs) if name.endswith(".png")
)


print("Number of images in total:")
print(" - ", len(images_paths))

print("\nExamples of PNG images paths:")
images_paths[:2]

Number of images in total:
 -  10239

Examples of PNG images paths:


['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132_cropped.png',
 '1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132_roi.png']

In [256]:
# Creates a list of series UIDs: the part of each file name before the first "_"
# (file names look like "<series_uid>_cropped.png" / "<series_uid>_roi.png")
images_series = [path.split("_")[0] for path in images_paths]
images_series[:3]

['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547']

In [257]:
# Compare the number of distinct file names and distinct series UIDs with the
# total number of files (fewer distinct series than files means that some
# series UIDs are shared by several files)
print("Unique paths:", len(set(images_paths)), "out of ", len(images_paths))
print("Unique series:", len(set(images_series)), "out of ", len(images_series))

Unique paths: 10239 out of  10239
Unique series: 6775 out of  10239


#### Extract metadata
As the uniqueness check above shows, some series contain more than one image. For that reason, the file path can be used to differentiate images by adding a column with the type of image.

In [258]:
# Function for matching images paths to metadata 
def get_images_metadata(images_series, metadata_series, metadata_col):
    """Look up one metadata value for every image series UID.

    Parameters
    ----------
    images_series : iterable
        Series UIDs to look up, one per image.
    metadata_series : iterable
        Series UIDs from the metadata table, aligned position by position
        with ``metadata_col``.
    metadata_col : iterable
        Metadata values to return for the matching UIDs.

    Returns
    -------
    list
        One entry per element of ``images_series``: the value of
        ``metadata_col`` at the first position where the UID occurs in
        ``metadata_series``, or ``None`` when the UID is absent (in which case
        a "<uid> not found" line is also printed).
    """
    # Build the UID -> value lookup once instead of converting the metadata
    # column to a list for every image. Pairing by position (zip) keeps the
    # result independent of any pandas index labels; setdefault keeps the
    # first occurrence of a duplicated UID, like list.index did.
    lookup = {}
    for uid, value in zip(metadata_series, metadata_col):
        lookup.setdefault(uid, value)

    new_col = []
    # uses description series uids
    for serie in images_series:
        # makes sure the series is in the metadata
        if serie in lookup:
            new_col.append(lookup[serie])
        else:
            new_col.append(None)
            print(serie, "not found")

    return new_col
            

In [259]:
# Create dataframe and add metadata
base_path = "CBIS-DDSM_Clean_Data/labeled_images_png/"
# File names look like "<series_uid>_<type>.png"; keep only the <type> part
imges_types = [path.split("_")[1].split(".")[0] for path in images_paths]
mapping = {"full": "00", "cropped": "01", "roi": "02"}

# Assemble every column in a single constructor call (dict order = column order).
# image_id is the last five characters of the series UID plus a two-digit type code.
images_data = pd.DataFrame({
    "image_id": [uid[-5:] + "_" + mapping[kind] for uid, kind in zip(images_series, imges_types)],
    "image_type": imges_types,
    "image_path": [base_path + file_name for file_name in images_paths],
    "series_uid": images_series,
    "subject_id": get_images_metadata(images_series, metadata_df["Series UID"], metadata_df["Subject ID"]),
    "study_uid": get_images_metadata(images_series, metadata_df["Series UID"], metadata_df["Study UID"]),
})


In [260]:
images_data.iloc[:3]

Unnamed: 0,image_id,image_type,image_path,series_uid,subject_id,study_uid
0,91132_01,cropped,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100018879311824535125...,Calc-Training_P_01128_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.346810468613236696616...
1,91132_02,roi,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100018879311824535125...,Calc-Training_P_01128_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.346810468613236696616...
2,51547_00,full,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100131208110604806117...,Calc-Training_P_01107_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.113816182611334006337...


#### Extract description data
The description files contain three columns that specify the series belonging to one description. However, to find the data we need to identify which series column should be used (full, ROI, or cropped). Moreover, it needs to be verified whether all rows matched for an image contain the same data.

In [261]:
images_data["image_type"][:4]

0    cropped
1        roi
2       full
3       full
Name: image_type, dtype: object

In [262]:
# Function for matching images paths to description data 
def get_images_description(images_series, image_types, descr_series_cols, descr_data):
    """Match every image to its row of description data.

    Parameters
    ----------
    images_series : sequence
        Series UID of each image.
    image_types : iterable
        Type of each image ("full", "cropped" or "roi"), aligned with
        ``images_series``.
    descr_series_cols : sequence of three pandas Series
        Series-UID columns of the description data, in the order
        (full, cropped, roi), row-aligned with ``descr_data``.
    descr_data : pandas.DataFrame
        Description columns to return for each image.

    Returns
    -------
    rows : list of lists
        One list per image with the values of the matching description row.
        The list holds ``None`` for every column when the image is not found,
        or when it matches several rows that are not all identical.
    multi_label_images : list
        Series UIDs that matched several non-identical rows.
    not_found : list
        Series UIDs with no matching row (including images whose type is not
        one of the three known types).
    """
    # Dispatch table: which series-UID column to search for each image type
    series_col_by_type = {
        "full": descr_series_cols[0],
        "cropped": descr_series_cols[1],
        "roi": descr_series_cols[2],
    }
    # Placeholder rows follow the actual width of descr_data instead of a
    # hard-coded 6, so all returned rows always have the same length
    n_cols = descr_data.shape[1]

    rows = [None] * len(images_series)
    multi_label_images = []
    not_found = []
    for i, (serie, im_type) in enumerate(zip(images_series, image_types)):
        series_col = series_col_by_type.get(im_type)
        if series_col is None:
            # Unknown image type: treat as "no match" rather than reusing the
            # matches left over from the previous iteration
            matched_rows = descr_data.iloc[0:0]
        else:
            matched_rows = descr_data[series_col == serie]

        # Makes sure the data is identical in the rows found
        if len(matched_rows) > 1:
            # NOTE(review): NaN never compares equal to NaN, so otherwise
            # identical rows containing missing values count as different here
            identical = (matched_rows == matched_rows.iloc[0]).all().all()
            if identical:
                rows[i] = list(matched_rows.iloc[0])
            else:
                rows[i] = [None] * n_cols
                multi_label_images.append(serie)
        elif len(matched_rows) == 1:
            rows[i] = list(matched_rows.iloc[0])
        else:
            rows[i] = [None] * n_cols
            not_found.append(serie)
    print("Number of cases that contain multiple labels for one image: ", len(multi_label_images))
    print("Number of series that where NOT found in the description data: ", len(not_found))
    return rows, multi_label_images, not_found
        
        



In [263]:
# Adds Description data to new dataframe
col_names = [
    "breast_density", "breast_side", "image_view", "abnormality_type", "pathology", "split",
    "series_uid_full", "series_uid_crop", "series_uid_roi",
]
all_desc_data = pd.concat([train_desc[col_names], test_desc[col_names]], ignore_index=True)

# The first six columns are the description values to attach to each image;
# the last three are the series-UID columns searched for full / cropped / roi images
desc_values = all_desc_data[all_desc_data.columns[:6]]
desc_uid_cols = [all_desc_data[uid_col] for uid_col in ("series_uid_full", "series_uid_crop", "series_uid_roi")]

images_desc, multi_label_cases, not_found_cases = get_images_description(
    images_series,
    images_data["image_type"],
    desc_uid_cols,
    desc_values,
)
images_desc[0]

Number of cases that contain multiple labels for one image:  19
Number of series that where NOT found in the description data:  2


[np.int64(3), 'RIGHT', 'CC', 'calcification', 'MALIGNANT', 'train']

#### Combine metadata, paths and description data

In [264]:
# Turn the matched description rows into a dataframe, labelled with the six
# description column names, and preview the first four rows
desc_value_names = col_names[:6]
images_desc_df = pd.DataFrame(images_desc, columns=desc_value_names)
images_desc_df.head(4)

Unnamed: 0,breast_density,breast_side,image_view,abnormality_type,pathology,split
0,3.0,RIGHT,CC,calcification,MALIGNANT,train
1,3.0,RIGHT,CC,calcification,MALIGNANT,train
2,2.0,LEFT,CC,calcification,BENIGN,train
3,2.0,LEFT,MLO,mass,MALIGNANT,test


In [265]:
# Place the image data (file paths and metadata) and the description data side
# by side; both frames have one row per image, in the same order
all_images_data = pd.concat([images_data, images_desc_df], axis=1)
# Inspect one row for which no description values were attached
all_images_data.iloc[4765]

image_id                                                     91565_02
image_type                                                        roi
image_path          CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....
series_uid          1.3.6.1.4.1.9590.100.1.2.276112479100234745155...
subject_id                           Calc-Training_P_00474_LEFT_MLO_1
study_uid           1.3.6.1.4.1.9590.100.1.2.317367128312662574506...
breast_density                                                    NaN
breast_side                                                      None
image_view                                                       None
abnormality_type                                                 None
pathology                                                        None
split                                                            None
Name: 4765, dtype: object

In [266]:
len(all_images_data)

10239

In [267]:
# Keep only the rows that have a pathology label (rows without matched
# description data carry None in that column)
has_pathology = all_images_data["pathology"].notna()
all_images_data = all_images_data[has_pathology]
len(all_images_data)

10218

### Splitting Stage
#### Separate full images, cropped and ROIs masks

In [268]:
# One dataframe per kind of image: full mammograms, cropped patches and ROI masks
image_kind = all_images_data["image_type"]
full_images = all_images_data[image_kind == "full"]
cropped_images = all_images_data[image_kind == "cropped"]
roi_images = all_images_data[image_kind == "roi"]
full_images.head(4)

Unnamed: 0,image_id,image_type,image_path,series_uid,subject_id,study_uid,breast_density,breast_side,image_view,abnormality_type,pathology,split
2,51547_00,full,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100131208110604806117...,Calc-Training_P_01107_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.113816182611334006337...,2.0,LEFT,CC,calcification,BENIGN,train
3,62869_00,full,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100522099512256189513...,Mass-Test_P_00576_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.286066835611001826026...,2.0,LEFT,MLO,mass,MALIGNANT,test
4,45647_00,full,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100522676511025180541...,Mass-Test_P_01510_RIGHT_MLO,1.3.6.1.4.1.9590.100.1.2.403479789212123359501...,4.0,RIGHT,MLO,mass,BENIGN,test
7,99884_00,full,CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....,1.3.6.1.4.1.9590.100.1.2.100579676611077714807...,Calc-Training_P_00685_RIGHT_CC,1.3.6.1.4.1.9590.100.1.2.137434121012998881531...,3.0,RIGHT,CC,calcification,MALIGNANT,train


#### Split data based on split marker

In [269]:
# Partition the full images by their split marker and report the set sizes
split_marker = full_images["split"]
train_full = full_images[split_marker == "train"]
test_full = full_images[split_marker == "test"]

n_train, n_test = len(train_full), len(test_full)
total_full = (n_train + n_test)
print("\nTraining set size:", n_train)
print("Testing set size:", n_test, "\n")
print("Total full images cases: ", total_full)
print("Train/Test ratio:", (n_train / total_full), " / ", (n_test / total_full))


Training set size: 2443
Testing set size: 641 

Total full images cases:  3084
Train/Test ratio: 0.7921530479896238  /  0.20784695201037615


In [270]:
print(train_full.iloc[0], "\n")
print(test_full.iloc[0])

image_id                                                     51547_00
image_type                                                       full
image_path          CBIS-DDSM_Clean_Data/labeled_images_png/1.3.6....
series_uid          1.3.6.1.4.1.9590.100.1.2.100131208110604806117...
subject_id                              Calc-Training_P_01107_LEFT_CC
study_uid           1.3.6.1.4.1.9590.100.1.2.113816182611334006337...
breast_density                                                    2.0
breast_side                                                      LEFT
image_view                                                         CC
abnormality_type                                        calcification
pathology                                                      BENIGN
split                                                           train
Name: 2, dtype: object 

image_id                                                     62869_00
image_type                                                       

### Labels Distribution 
The following code shows the number of cases per class.

In [292]:
# Count the training cases per pathology class
train_labels = train_full["pathology"]
total = len(train_labels)
malignant = int((train_labels == "MALIGNANT").sum())
benignant = int((train_labels == "BENIGN").sum())
benignant_wo_call = int((train_labels == "BENIGN_WITHOUT_CALLBACK").sum())

# Percentages are relative to the malignant + benign cases only
binary_total = malignant + benignant
print("Total of training cases: ", total)
print("Total of benign + malignant cases: ", binary_total, "\n")
print("Malignant: ", malignant, "cases.", malignant * 100 / binary_total, "% of malignant plus benignant")
print("Benignant: ", benignant, "cases.", benignant * 100 / binary_total, "% of malignant plus benignant\n")
print("Benignant without call back: ", benignant_wo_call)

Total of training cases:  2443
Total of benign + malignant cases:  2123 

Malignant:  1098 cases. 51.71926519076778 % of malignant plus benignant
Benignant:  1025 cases. 48.28073480923222 % of malignant plus benignant

Benignant without call back:  320


In [293]:
# Count the testing cases per pathology class
test_labels = test_full["pathology"]
total = len(test_labels)
malignant = int((test_labels == "MALIGNANT").sum())
benignant = int((test_labels == "BENIGN").sum())
benignant_wo_call = int((test_labels == "BENIGN_WITHOUT_CALLBACK").sum())

# Percentages are relative to the malignant + benign cases only
binary_total = malignant + benignant
print("Total of testing cases: ", total)
print("Total of benign + malignant cases: ", binary_total, "\n")
print("Malignant: ", malignant, "cases.", malignant * 100 / binary_total, "% of malignant plus benignant")
print("Benignant: ", benignant, "cases.", benignant * 100 / binary_total, "% of malignant plus benignant\n")
print("Benignant without call back: ", benignant_wo_call)

Total of testing cases:  641
Total of benign + malignant cases:  562 

Malignant:  260 cases. 46.263345195729535 % of malignant plus benignant
Benignant:  302 cases. 53.736654804270465 % of malignant plus benignant

Benignant without call back:  79


Due to class imbalance, the benign-without-callback class will be removed; therefore, the train and test sets will be used for a binary classification of malignant vs. benign pathology.

In [294]:
# Removes the "benign without callback" cases from both sets
excluded_label = "BENIGN_WITHOUT_CALLBACK"
train_keep = train_full["pathology"] != excluded_label
test_keep = test_full["pathology"] != excluded_label
train_full = train_full[train_keep]
test_full = test_full[test_keep]


In [296]:
test_full["pathology"]

3        MALIGNANT
4           BENIGN
9        MALIGNANT
24          BENIGN
27       MALIGNANT
           ...    
10205    MALIGNANT
10208    MALIGNANT
10214       BENIGN
10218    MALIGNANT
10219       BENIGN
Name: pathology, Length: 562, dtype: object

###  Image Size

In [297]:
# Collects the [width, height] of every training image
from PIL import Image

sizes_train = []
for png_path in train_full["image_path"]:
    # The context manager closes the file once its size has been read
    with Image.open(png_path) as img:
        sizes_train.append(list(img.size))

In [282]:
sizes_train[:5]

[[3826, 6601], [2952, 4504], [3480, 5992], [3166, 6256], [2896, 4504]]

### Save new organized data in Dataframe
Save full images data to csv files to be used in model training

In [298]:
train_full.columns

Index(['image_id', 'image_type', 'image_path', 'series_uid', 'subject_id',
       'study_uid', 'breast_density', 'breast_side', 'image_view',
       'abnormality_type', 'pathology', 'split'],
      dtype='object')

In [299]:
# Save the full-image train/test tables to CSV files to be used in model training
# (index=False: the dataframe index is not written to the file)
train_full.to_csv("CBIS-DDSM_Clean_Data/train_full.csv", index=False)
test_full.to_csv("CBIS-DDSM_Clean_Data/test_full.csv", index=False)

### Save full images in new directory

In [None]:
# def save_full_images(dset):
#     # parent directory
#     parent_dir = "CBIS-DDSM_Clean_Data/full_images/"
#     os.makedirs(parent_dir, exist_ok=True)
    
#     for path in dset["image_path"]:
            
#         # loads image
#         image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        
#         # creates path and saves image
#         file_name = path.split("/")[-1]
        
#         im_path = os.path.join(parent_dir, file_name)
#         cv2.imwrite(im_path, image)