# Environment Setup and Prototype

## Sprint 1: Project Setup and Prototype

### 1.1 Basic Data Preprocessing

#### 1.1.1 Import Libraries

In [12]:
# Import libraries
import os

import numpy as np
import pandas as pd

#### 1.1.2 Load Data
The CBIS-DDSM dataset contains:
- A metadata file in CSV format
- Four description files in CSV format
- One directory with images stored in subdirectories whose paths are based on the description of the case.

In [13]:
# Locations of the metadata file and of the four case-description files
meta_path = 'CBIS-DDSM_Data/images_dataset/metadata.csv'
mass_train_path = "CBIS-DDSM_Data/mass_case_description_train_set.csv"
mass_test_path = "CBIS-DDSM_Data/mass_case_description_test_set.csv"
calc_train_path = "CBIS-DDSM_Data/calc_case_description_train_set.csv"
calc_test_path = "CBIS-DDSM_Data/calc_case_description_test_set.csv"

# Metadata table (read later through its "Series UID", "Subject ID",
# "Study UID" and "Series Description" columns)
metadata_df = pd.read_csv(meta_path)

# Case descriptions for the mass / calcification training and testing sets
mass_train_orig = pd.read_csv(mass_train_path)
mass_test_orig = pd.read_csv(mass_test_path)
calc_train_orig = pd.read_csv(calc_train_path)
calc_test_orig = pd.read_csv(calc_test_path)

Example of an image's path    

In [14]:
# Path of the first calcification test case, as stored in the description file
calc_test_orig.loc[0, "image file path"]

'Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009/1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992/000000.dcm'

#### 1.1.3 Match and Combine Description Data
Currently the mass and calcification description files use different column names. The column names are matched so that the files can be combined.

In [15]:
# Target column names shared by every description table after renaming
col_new_names = [
    "breast_density",
    "breast_side",
    "image_view",
    "abnormality_type",
    "pathology",
    "image_path",
    "cropped_image_path",
    "ROI_mask_path",
]

# Current headers of the mass files, listed in the same order as col_new_names
mass_names = [
    "breast_density",
    "left or right breast",
    "image view",
    "abnormality type",
    "pathology",
    "image file path",
    "cropped image file path",
    "ROI mask file path",
]

# Current headers of the calcification files: identical to the mass headers
# except for the first one, which uses a space instead of an underscore
calc_names = ["breast density"] + mass_names[1:]

In [16]:
# Build each old-name -> new-name mapping once and reuse it for train and test
mass_rename_map = dict(zip(mass_names, col_new_names))
calc_rename_map = dict(zip(calc_names, col_new_names))

# Rename columns so the four description tables share the same headers
mass_train_renamed = mass_train_orig.rename(columns=mass_rename_map)
mass_test_renamed = mass_test_orig.rename(columns=mass_rename_map)
calc_train_renamed = calc_train_orig.rename(columns=calc_rename_map)
calc_test_renamed = calc_test_orig.rename(columns=calc_rename_map)

In [17]:
# Print the number of cases in each description file.
# Each label names the set it counts (the original labels called every set,
# including the two test sets, "trained cases").
print("Original Number of Cases:")
print(" - Mass train cases: ", len(mass_train_renamed))
print(" - Mass test cases: ", len(mass_test_renamed))
print(" - Calc train cases: ", len(calc_train_renamed))
print(" - Calc test cases: ", len(calc_test_renamed))

Original Number of Cases:
 - Mass trained cases:  1318
 - Mass trained cases:  378
 - Calc trained cases:  1546
 - Calc trained cases:  326


In [304]:
# Combine the needed columns of the mass and calcification tables into one
# training frame and one testing frame, then print the number of cases in each.
# ignore_index=True renumbers the rows 0..n-1 in the combined frames.
train_desc = pd.concat(
    [mass_train_renamed[col_new_names], calc_train_renamed[col_new_names]],
    ignore_index=True,
)
test_desc = pd.concat(
    [mass_test_renamed[col_new_names], calc_test_renamed[col_new_names]],
    ignore_index=True,
)
print("New Number of Cases per Set:")
print(" - Train cases: ", len(train_desc))
print(" - Test cases: ", len(test_desc))
print("\n", train_desc.columns)

New Number of Cases per Set:
 - Train cases:  2864
 - Test cases:  704

 Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path'],
      dtype='object')


#### 1.1.4 Extract series UIDs from paths
The decompressed images are stored in a single directory, where each file is named after the series UID taken from the metadata file. To connect each description with its respective decompressed image, the series UIDs are extracted from the paths in the description files and matched with the images stored in images_png.

In [306]:
# Function for extracting description series UIDs from paths stored in the description files 
def get_series_from_descriptions(im_types, new_cols, train, test, new_train, new_test):
    """Add one series-UID column per path column to the train and test frames.

    Args:
        im_types: names of the path columns to read from ``train`` and ``test``.
        new_cols: names of the columns to create, parallel to ``im_types``.
        train: table holding the training paths.
        test: table holding the testing paths.
        new_train: table that receives the new training columns (modified in place).
        new_test: table that receives the new testing columns (modified in place).

    Returns:
        The ``(new_train, new_test)`` pair.
    """
    def third_component(paths):
        # the notebook takes the third "/"-separated piece of a path as the series UID
        return [path.split("/")[2] for path in paths]

    for source_col, target_col in zip(im_types, new_cols):
        # both sides are extracted before either frame is written to
        train_uids, test_uids = third_component(train[source_col]), third_component(test[source_col])
        new_train[target_col] = train_uids
        new_test[target_col] = test_uids
    return new_train, new_test

In [307]:
# Extract description series UIDs from paths stored in the description files.
# The combined description frames are both the source of the path columns and
# the destination of the new series_uid_* columns (train_orig / test_orig are
# not defined anywhere in this notebook, so they cannot be used here).
train_desc, test_desc = get_series_from_descriptions(
    ["image_path", "cropped_image_path", "ROI_mask_path"],
    ["series_uid_full", "series_uid_crop", "series_uid_roi"],
    train_desc, test_desc, train_desc, test_desc,
)

In [308]:
# Show the first and the last training case; both now carry the series_uid columns
print("Two examples of description data cases with series_uid included: \n")
first_case = train_desc.iloc[0]
last_case = train_desc.iloc[-1]
print(first_case, "\n")
print(last_case)

Two examples of description data cases with series_uid included: 

breast_density                                                        3
breast_side                                                        LEFT
image_view                                                           CC
abnormality_type                                                   mass
pathology                                                     MALIGNANT
image_path            Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...
cropped_image_path    Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
ROI_mask_path         Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
series_uid_full       1.3.6.1.4.1.9590.100.1.2.342386194811267636608...
series_uid_crop       1.3.6.1.4.1.9590.100.1.2.296736403313792599626...
series_uid_roi        1.3.6.1.4.1.9590.100.1.2.296736403313792599626...
Name: 0, dtype: object 

breast_density                                                        1
breast_side                                 

In [350]:
# Print every training case whose cropped-image and ROI-mask series UIDs differ
crop_uids = train_desc["series_uid_crop"]
roi_uids = train_desc["series_uid_roi"]
for crop_uid, roi_uid in zip(crop_uids, roi_uids):
    if crop_uid == roi_uid:
        continue
    print(crop_uid, roi_uid, "------", sep="\n")

1.3.6.1.4.1.9590.100.1.2.11686230612923981409805359571305102767
1.3.6.1.4.1.9590.100.1.2.203970065112706924324370021454096282600
------
1.3.6.1.4.1.9590.100.1.2.48416062411306564131997923171520794902
1.3.6.1.4.1.9590.100.1.2.213479805111756800235140829670343680695
------
1.3.6.1.4.1.9590.100.1.2.132003408513396862636923368121361942631
1.3.6.1.4.1.9590.100.1.2.167721256311930226218015781691859713018
------
1.3.6.1.4.1.9590.100.1.2.35972224112655501315001155893333671981
1.3.6.1.4.1.9590.100.1.2.312064575313543515220300745352630857321
------
1.3.6.1.4.1.9590.100.1.2.302043706512531689934596629952753769958
1.3.6.1.4.1.9590.100.1.2.278022131711751825320356179721710838376
------
1.3.6.1.4.1.9590.100.1.2.219157189212113616029573495392380980692
1.3.6.1.4.1.9590.100.1.2.62477624313950073629115651880107415441
------
1.3.6.1.4.1.9590.100.1.2.235208716612742186319958432593634355989
1.3.6.1.4.1.9590.100.1.2.11522965912391185905725226742050551997
------
1.3.6.1.4.1.9590.100.1.2.186327494411516593004

#### 1.1.5 Get the images' new locations and build a new data frame
Create a new data frame with information about each image. For each saved image:
- extract its series_uid
- find its respective information in the description (train/test) sets
- find its respective metadata.

In [98]:
# Collect the PNG file names stored in the images_png directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# row positions (used later with .iloc) stable across machines and re-runs.
# Non-PNG entries are skipped because the series UID and image id are later
# derived from the "<series uid>_<n>-<m>.png" file-name pattern.
path_to_imgs = "CBIS-DDSM_Clean_Data/images_png/"
images_paths = sorted(
    name for name in os.listdir(path_to_imgs) if name.lower().endswith(".png")
)

print("Number of images in total:")
print(" - ", len(images_paths))

print("\nExamples of PNG images paths:")
images_paths[:2]

Number of images in total:
 -  10239

Examples of PNG images paths:


['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132_1-1.png',
 '1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132_1-2.png']

In [99]:
# Creates a list of series UIDs by dropping the trailing "_<n>-<m>.png" suffix.
# Splitting on the last underscore (instead of cutting a fixed 8 characters)
# keeps working when an image number has more than one digit.
images_series = [path.rsplit("_", 1)[0] for path in images_paths]
images_series[:3]

['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547']

In [100]:
# Compare the number of distinct file names and distinct series UIDs with the totals
n_unique_paths = len(set(images_paths))
n_unique_series = len(set(images_series))
print("Unique paths:", n_unique_paths, "out of ", len(images_paths))
print("Unique series:", n_unique_series, "out of ", len(images_series))

Unique paths: 10239 out of  10239
Unique series: 6775 out of  10239


#### 1.1.6 Extract metadata
As we can see in the following cell, some series contain more than one image. For that reason, the path can be used to differentiate images.

In [309]:
# Function for matching images paths to metadata 
def get_images_metadata(images_series, metadata_series, metadata_col, col_type=False):
    """Look up one metadata value for every image series UID.

    Args:
        images_series: iterable of series UIDs, one per image.
        metadata_series: series UIDs listed in the metadata table.
        metadata_col: metadata values, positionally parallel to
            ``metadata_series``.
        col_type: when True, the result is returned as a pandas Series and the
            long series descriptions are shortened to "full", "ROI" and
            "cropped".

    Returns:
        A list (or a pandas Series when ``col_type`` is True) with one entry per
        image. Images whose series UID is absent from ``metadata_series`` get
        ``None`` and are reported with a "not found" message.
    """
    # Build the UID -> value table once instead of scanning the metadata for
    # every image. setdefault keeps the first occurrence of a repeated UID,
    # which is the row list.index() used to select.
    first_value_by_uid = {}
    for uid, value in zip(metadata_series, metadata_col):
        first_value_by_uid.setdefault(uid, value)

    new_col = []
    for serie in images_series:
        if serie in first_value_by_uid:
            new_col.append(first_value_by_uid[serie])
        else:
            # keep one entry per image so the column stays aligned with the images
            new_col.append(None)
            print(serie, "not found")

    if col_type:
        new_col = (
            pd.Series(new_col)
            .replace("full mammogram images", "full")
            .replace("ROI mask images", "ROI")
            .replace("cropped images", "cropped")
        )
    return new_col
            

In [310]:
# Start from an empty frame and add one column at a time
images_data = pd.DataFrame()

# Columns derived directly from the PNG file names; image_id is the nine
# characters that precede the 4-character extension
images_data["image_id"] = [path[-13:-4] for path in images_paths]
images_data["image_path"] = images_paths
images_data["series_uid"] = images_series

# Columns looked up in the metadata table through the series UID
metadata_uids = metadata_df["Series UID"]
images_data["subject_id"] = get_images_metadata(images_series, metadata_uids, metadata_df["Subject ID"])
images_data["study_uid"] = get_images_metadata(images_series, metadata_uids, metadata_df["Study UID"])
images_data["image_type"] = get_images_metadata(
    images_series, metadata_uids, metadata_df["Series Description"], col_type=True
)

In [347]:
# Preview the first ten rows of the image table
images_data.head(10)

Unnamed: 0,image_id,image_path,series_uid,subject_id,study_uid,image_type
0,91132_1-1,1.3.6.1.4.1.9590.100.1.2.100018879311824535125...,1.3.6.1.4.1.9590.100.1.2.100018879311824535125...,Calc-Training_P_01128_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.346810468613236696616...,ROI
1,91132_1-2,1.3.6.1.4.1.9590.100.1.2.100018879311824535125...,1.3.6.1.4.1.9590.100.1.2.100018879311824535125...,Calc-Training_P_01128_RIGHT_CC_1,1.3.6.1.4.1.9590.100.1.2.346810468613236696616...,ROI
2,51547_1-1,1.3.6.1.4.1.9590.100.1.2.100131208110604806117...,1.3.6.1.4.1.9590.100.1.2.100131208110604806117...,Calc-Training_P_01107_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.113816182611334006337...,full
3,62869_1-1,1.3.6.1.4.1.9590.100.1.2.100522099512256189513...,1.3.6.1.4.1.9590.100.1.2.100522099512256189513...,Mass-Test_P_00576_LEFT_MLO,1.3.6.1.4.1.9590.100.1.2.286066835611001826026...,full
4,45647_1-1,1.3.6.1.4.1.9590.100.1.2.100522676511025180541...,1.3.6.1.4.1.9590.100.1.2.100522676511025180541...,Mass-Test_P_01510_RIGHT_MLO,1.3.6.1.4.1.9590.100.1.2.403479789212123359501...,full
5,41962_1-1,1.3.6.1.4.1.9590.100.1.2.100552942107662341409...,1.3.6.1.4.1.9590.100.1.2.100552942107662341409...,Calc-Training_P_00539_RIGHT_MLO_3,1.3.6.1.4.1.9590.100.1.2.735487009117821327380...,ROI
6,41962_1-2,1.3.6.1.4.1.9590.100.1.2.100552942107662341409...,1.3.6.1.4.1.9590.100.1.2.100552942107662341409...,Calc-Training_P_00539_RIGHT_MLO_3,1.3.6.1.4.1.9590.100.1.2.735487009117821327380...,ROI
7,99884_1-1,1.3.6.1.4.1.9590.100.1.2.100579676611077714807...,1.3.6.1.4.1.9590.100.1.2.100579676611077714807...,Calc-Training_P_00685_RIGHT_CC,1.3.6.1.4.1.9590.100.1.2.137434121012998881531...,full
8,34166_1-1,1.3.6.1.4.1.9590.100.1.2.100631678311747240317...,1.3.6.1.4.1.9590.100.1.2.100631678311747240317...,Calc-Training_P_01397_LEFT_CC,1.3.6.1.4.1.9590.100.1.2.333224815513785648507...,full
9,82046_1-1,1.3.6.1.4.1.9590.100.1.2.100632214012866120117...,1.3.6.1.4.1.9590.100.1.2.100632214012866120117...,Mass-Test_P_01294_RIGHT_MLO,1.3.6.1.4.1.9590.100.1.2.521002057126853588417...,full


In [311]:
# Function for matching images paths to description data 
def get_images_description(new_col, images_series, image_types, descr_series_cols, descr_col):
    """Fill ``new_col`` with the description value that matches each image.

    For every image, its series UID is searched in the description UID column
    that corresponds to the image type ("full", "cropped" or "ROI"). The value
    is written only when all matching description rows agree on it; otherwise
    the existing entry of ``new_col`` is left untouched, so the function can be
    called once per description set on the same list.

    Args:
        new_col: list with one slot per image; updated in place.
        images_series: series UID of each image.
        image_types: type of each image ("full", "cropped" or "ROI"). Images
            with any other type (for example a missing one) are skipped.
        descr_series_cols: pandas Series holding the description series UIDs
            for full, cropped and ROI images, in that order.
        descr_col: pandas Series with the description values, sharing its index
            labels with the Series in ``descr_series_cols``.

    Returns:
        The same ``new_col`` list.
    """
    type_position = {"full": 0, "cropped": 1, "ROI": 2}

    # One table per UID column: series UID -> set of description values found in
    # the rows carrying that UID. Built once instead of scanning per image.
    values_by_uid = []
    for series_col in descr_series_cols:
        table = {}
        for label, uid in series_col.items():
            table.setdefault(uid, set()).add(descr_col[label])
        values_by_uid.append(table)

    for i, (serie, im_type) in enumerate(zip(images_series, image_types)):
        position = type_position.get(im_type)
        if position is None:
            # Unknown type: skip it rather than reuse the previous image's match
            continue
        values = values_by_uid[position].get(serie)
        # Only unambiguous matches (a single distinct value) are recorded
        if values is not None and len(values) == 1:
            new_col[i] = next(iter(values))

    return new_col

In [313]:
# Adds Description data to new dataframe
col_names = ["breast_density", "breast_side", "image_view", "abnormality_type", "pathology", "set_name"]

# One empty slot per image, filled with breast density from the training descriptions
new_col = [None] * len(images_series)
train_uid_cols = [
    train_desc["series_uid_full"],
    train_desc["series_uid_crop"],
    train_desc["series_uid_roi"],
]
new_col = get_images_description(
    new_col, images_series, images_data["image_type"], train_uid_cols, train_desc["breast_density"]
)

In [315]:
# Second pass over the same list, this time matching against the testing descriptions
test_uid_cols = [
    test_desc["series_uid_full"],
    test_desc["series_uid_crop"],
    test_desc["series_uid_roi"],
]
new_col = get_images_description(
    new_col, images_series, images_data["image_type"], test_uid_cols, test_desc["breast_density"]
)

In [317]:
# Count and locate the images that still have no description value.
# numpy is imported in the notebook's import cell at the top.
# "== None" is intentional: it compares element by element on the object array.
missing_mask = np.array(new_col) == None
print(np.sum(missing_mask))
print(np.where(missing_mask))

2
(array([4765, 5969]),)


In [341]:
# Series UIDs at the two row positions reported as missing a description value
for position in (4765, 5969):
    print(images_data.iloc[position]["series_uid"])

1.3.6.1.4.1.9590.100.1.2.27611247910023474515584644283556391565
1.3.6.1.4.1.9590.100.1.2.323173986211744534717152497940376023803


In [332]:
# Cropped-image series UID of the test case labelled 1
test_desc.loc[1, "series_uid_crop"]

'1.3.6.1.4.1.9590.100.1.2.381440141511137044327302306604206077287'

In [346]:
# Is this series UID present among the training cropped-image UIDs?
target_uid = '1.3.6.1.4.1.9590.100.1.2.27611247910023474515584644283556391565'
target_uid in train_desc["series_uid_crop"].tolist()

True

In [345]:
# Position of the first training case that carries this cropped-image series UID
target_uid = '1.3.6.1.4.1.9590.100.1.2.27611247910023474515584644283556391565'
train_desc["series_uid_crop"].tolist().index(target_uid)

1662

In [276]:
# Check that the description column has one entry per image.
# NOTE(review): the original compared len(a), but `a` is not defined anywhere in
# this notebook; new_col is presumably the list it referred to - confirm.
print(len(new_col), len(images_series))

10239 10239


In [119]:
# find images for each data set
# NOTE(review): get_sets_images, series_to_imgs, paths_to_files, train_orig and
# test_orig are not defined in the visible part of this notebook - confirm they
# still exist before relying on this cell after a kernel restart.
train_paths = get_sets_images(train_orig, series_to_imgs, paths_to_files)
test_paths = get_sets_images(test_orig, series_to_imgs, paths_to_files)
# report how many image paths were returned compared with the number of cases
print(len(train_paths), "images were found out of ", len(train_orig))
print(len(test_paths), "images were found out of ", len(test_orig))

2864 images were found out of  2864
704 images were found out of  704


In [120]:
# list the columns currently present in the training frame
train_orig.columns

Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path',
       'series_uid', 'images_new_paths', 'image_type'],
      dtype='object')

In [121]:
# add the image paths found for each case to the train and test sets
# (train_paths / test_paths must have one entry per row of the frame)
train_orig["images_new_paths"] = train_paths
test_orig["images_new_paths"] = test_paths

### Add the series description (ROI mask or full mammogram) to the dataframe

In [122]:
# function for matching data set series UIDs to metadata series descriptions
def get_sets_desc(set_series, meta_series, descriptions):
    """Collect the series description of every case found in the metadata.

    Args:
        set_series: table with a "series_uid" column, one row per case.
        meta_series: series UIDs listed in the metadata table.
        descriptions: series descriptions, positionally parallel to
            ``meta_series``.

    Returns:
        A list with the description of each case whose series UID appears in
        ``meta_series``, in case order. Cases that are not found are reported
        with a "not found" message and contribute no entry, so the length of
        the result is the number of matches.
    """
    # Build the lookup once; setdefault keeps the first occurrence of a UID.
    # (list.index raises ValueError on a miss instead of returning -1, so the
    # original "not found" branch could never run.)
    description_by_uid = {}
    for uid, description in zip(meta_series, descriptions):
        description_by_uid.setdefault(uid, description)

    descriptions_list = []
    for s_serie in set_series["series_uid"]:
        if s_serie in description_by_uid:
            descriptions_list.append(description_by_uid[s_serie])
        else:
            print(s_serie, "not found")
    return descriptions_list

In [123]:
# Look up the series description of every training and testing case
meta_uids = metadata_df["Series UID"]
meta_descriptions = metadata_df["Series Description"]
train_descs = get_sets_desc(train_orig, meta_uids, meta_descriptions)
test_descs = get_sets_desc(test_orig, meta_uids, meta_descriptions)

# Report how many descriptions were returned compared with the number of cases
print(len(train_descs), "descriptions were found out of ", len(train_orig))
print(len(test_descs), "descriptions were found out of ", len(test_orig))

2864 descriptions were found out of  2864
704 descriptions were found out of  704


In [124]:
# add the series descriptions (image types) to the train and test sets
train_orig["image_type"] = train_descs
test_orig["image_type"] = test_descs

### Save the newly organized data to CSV files

In [125]:
# list the columns available in the training frame before choosing which to save
train_orig.columns

Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path',
       'series_uid', 'images_new_paths', 'image_type'],
      dtype='object')

In [126]:
# Columns kept in the exported description files
cols_to_save = [
    'breast_density',
    'breast_side',
    'image_view',
    'abnormality_type',
    'pathology',
    'series_uid',
    'images_new_paths',
    'image_type',
]

# Write both sets without the row index
train_csv_path = "CBIS-DDSM_Clean_Data/train_descriptions.csv"
test_csv_path = "CBIS-DDSM_Clean_Data/test_descriptions.csv"
train_orig[cols_to_save].to_csv(train_csv_path, index=False)
test_orig[cols_to_save].to_csv(test_csv_path, index=False)