# Environment Setup and Prototype

## Sprint 1: Project Setup and Prototype

### 1.1 Basic Data Preprocessing

#### 1.1.1 Import Libraries

In [1]:
# Import libraries
import pandas as pd
import os

#### 1.1.2 Load Data
The CBIS-DDSM dataset contains:
- A metadata file in CSV format
- Four description files in CSV format
- One directory with images stored in subdirectories whose paths are based on the description of the case.

In [3]:
# Read the metadata table that accompanies the image set
meta_path = 'CBIS-DDSM_Data/images_dataset/metadata.csv'
metadata_df = pd.read_csv(meta_path)

# Read the four case-description tables (mass / calcification x train / test).
# The tuple order matches the order of the names they are unpacked into.
description_csvs = (
    "CBIS-DDSM_Data/mass_case_description_train_set.csv",
    "CBIS-DDSM_Data/mass_case_description_test_set.csv",
    "CBIS-DDSM_Data/calc_case_description_train_set.csv",
    "CBIS-DDSM_Data/calc_case_description_test_set.csv",
)
mass_train_orig, mass_test_orig, calc_train_orig, calc_test_orig = [
    pd.read_csv(csv_path) for csv_path in description_csvs
]

Example of an image's path    

In [4]:
# Show the image path stored under row label 0 of the calcification test descriptions
calc_test_orig.loc[0, "image file path"]

'Calc-Test_P_00038_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.85935434310203356712688695661986996009/1.3.6.1.4.1.9590.100.1.2.374115997511889073021386151921807063992/000000.dcm'

#### 1.1.3 Match and Combine Data
The mass and calcification description files currently use different column names. The column names are matched so that the files can be combined.

In [5]:
# Target column names shared by every description table after renaming
col_new_names = [
    "breast_density",
    "breast_side",
    "image_view",
    "abnormality_type",
    "pathology",
    "image_path",
    "cropped_image_path",
    "ROI_mask_path",
]

# Current column names in the mass tables, listed in the same order as col_new_names
mass_names = [
    "breast_density",
    "left or right breast",
    "image view",
    "abnormality type",
    "pathology",
    "image file path",
    "cropped image file path",
    "ROI mask file path",
]

# The calcification names differ from the mass names only in the first entry
# ("breast density" with a space instead of an underscore)
calc_names = ["breast density", *mass_names[1:]]

In [6]:
# Build one {current name: new name} mapping per abnormality type and reuse it
# for both the training and the testing table of that type
mass_rename_map = dict(zip(mass_names, col_new_names))
calc_rename_map = dict(zip(calc_names, col_new_names))

mass_train_renamed = mass_train_orig.rename(columns=mass_rename_map)
mass_test_renamed = mass_test_orig.rename(columns=mass_rename_map)
calc_train_renamed = calc_train_orig.rename(columns=calc_rename_map)
calc_test_renamed = calc_test_orig.rename(columns=calc_rename_map)

In [7]:
# Print the number of cases in each description file.
# The labels now distinguish the train and test files; previously all four
# lines said "trained cases", so the test counts were mislabelled.
print("Original Number of Cases:")
print(" - Mass train cases: ", len(mass_train_renamed))
print(" - Mass test cases: ", len(mass_test_renamed))
print(" - Calc train cases: ", len(calc_train_renamed))
print(" - Calc test cases: ", len(calc_test_renamed))

Original Number of Cases:
 - Mass trained cases:  1318
 - Mass trained cases:  378
 - Calc trained cases:  1546
 - Calc trained cases:  326


In [8]:
# Keep only the shared columns and stack the mass and calcification cases of each split
train_parts = [mass_train_renamed[col_new_names], calc_train_renamed[col_new_names]]
test_parts = [mass_test_renamed[col_new_names], calc_test_renamed[col_new_names]]
train_orig = pd.concat(train_parts, ignore_index=True)
test_orig = pd.concat(test_parts, ignore_index=True)

# Report the size of each combined split and the resulting column labels
print("New Number of Cases per Set:")
print(" - Train cases: ", train_orig.shape[0])
print(" - Test cases: ", test_orig.shape[0])
print("\n", train_orig.columns)

New Number of Cases per Set:
 - Train cases:  2864
 - Test cases:  704

 Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path'],
      dtype='object')


#### 1.1.4 Extract series UIDs from paths 
The decompressed images are stored in one directory, where each file name is the series UID generated using the metadata file. To connect each description with its already-decompressed image, the series UIDs are extracted from the paths in the description files and matched with the images stored in images_png.

In [9]:
# Image paths exactly as recorded in the description files
train_series_paths = train_orig["image_path"]
test_series_paths = test_orig["image_path"]

# The third "/"-separated component of each path is taken as the series UID
train_series_list = list(map(lambda dcm_path: dcm_path.split("/")[2], train_series_paths))
test_series_list = list(map(lambda dcm_path: dcm_path.split("/")[2], test_series_paths))

# Show one path / UID pair per split as a sanity check
print("Example of series UID extracted from path:")
print(" - Train path: ", train_series_paths[0])
print(" - Train serie: ", train_series_list[0])
print()
print(" - Test path: ", test_series_paths[0])
print(" - Test serie: ", test_series_list[0])

Example of series UID extracted from path:
 - Train path:  Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.422112722213189649807611434612228974994/1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515/000000.dcm
 - Train serie:  1.3.6.1.4.1.9590.100.1.2.342386194811267636608694132590482924515

 - Test path:  Mass-Test_P_00016_LEFT_CC/1.3.6.1.4.1.9590.100.1.2.416403281812750683720028031170500130104/1.3.6.1.4.1.9590.100.1.2.245063149211255120613007755642780114172/000000.dcm
 - Test serie:  1.3.6.1.4.1.9590.100.1.2.245063149211255120613007755642780114172


In [10]:
# Store the extracted series UIDs in a new "series_uid" column of each split
for frame, uids in ((train_orig, train_series_list), (test_orig, test_series_list)):
    frame["series_uid"] = uids

# Print the rows at positions 0 and 2000 of the training set to confirm the new column
print("Two examples of description data cases with series_uid included: \n")
first_case = train_orig.iloc[0]
later_case = train_orig.iloc[2000]
print(first_case, "\n")
print(later_case)

Two examples of description data cases with series_uid included: 

breast_density                                                        3
breast_side                                                        LEFT
image_view                                                           CC
abnormality_type                                                   mass
pathology                                                     MALIGNANT
image_path            Mass-Training_P_00001_LEFT_CC/1.3.6.1.4.1.9590...
cropped_image_path    Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
ROI_mask_path         Mass-Training_P_00001_LEFT_CC_1/1.3.6.1.4.1.95...
series_uid            1.3.6.1.4.1.9590.100.1.2.342386194811267636608...
Name: 0, dtype: object 

breast_density                                                        3
breast_side                                                       RIGHT
image_view                                                           CC
abnormality_type                            

#### 1.1.5 Get image locations for train and test sets
Get the series_uids of the training and testing sets from the description CSV files, then find the respective image for each case using the series_uid from the metadata CSV file.

In [11]:
# Collect the file names of the PNG images stored in the images_png directory
path_to_imgs = "CBIS-DDSM_Clean_Data/images_png/"

# os.listdir returns entries in arbitrary order, so slicing with [1:] drops an
# unpredictable entry (presumably it was meant to skip a hidden/system file -
# TODO confirm). Keep only *.png entries instead, and sort them so the listing
# is the same on every run and every platform.
paths_to_files = sorted(
    entry for entry in os.listdir(path_to_imgs) if entry.lower().endswith(".png")
)

print("Number of images in total:")
print(" - ", len(paths_to_files))

print("\nExamples of PNG images paths:")
paths_to_files[:3]

Number of images in total:
 -  7511

Examples of PNG images paths:


['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132_1-2.png',
 '1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547_1-1.png',
 '1.3.6.1.4.1.9590.100.1.2.100522099512256189513864912954167862869_1-1.png']

In [98]:
# Create the list of series UIDs, one per PNG file and in the same order as
# paths_to_files. The file names listed in this notebook look like
# "<series_uid>_<n>-<m>.png", so the extension is removed first and then
# everything from the last "_" onwards. The previous fixed-length slice
# (path[:-7]) left the trailing "_" attached to the UID for such names and
# only worked for suffixes of exactly that length.
series_to_imgs = [os.path.splitext(path)[0].rsplit("_", 1)[0] for path in paths_to_files]
series_to_imgs[:3]

['1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100018879311824535125115145152454291132',
 '1.3.6.1.4.1.9590.100.1.2.100131208110604806117271735422083351547']

In [118]:
# Function for matching images paths to data sets series
def get_images_description(meta_series, paths, desc_series):
    """Return the PNG location of every described case whose series UID is known.

    Performs the same matching as ``get_sets_images``; only the parameter
    order differs.

    Parameters
    ----------
    meta_series : sequence of str
        Series UIDs, aligned position-by-position with ``paths``.
    paths : sequence of str
        Image file names; ``paths[i]`` belongs to ``meta_series[i]``.
    desc_series : mapping or DataFrame
        Object whose ``"series_uid"`` entry iterates over the UIDs to look up.

    Returns
    -------
    list of str
        ``"CBIS-DDSM_Clean_Data/images_png/" + file name`` for every matched
        UID, in the order of ``desc_series["series_uid"]``. When a UID occurs
        several times in ``meta_series`` its first occurrence is used.
        Unmatched UIDs are reported on stdout and skipped, so the result can
        be shorter than the input.
    """
    # Map each UID to the position of its first occurrence (what list.index()
    # returned), built once instead of rescanning the list for every case.
    first_position = {}
    for position, uid in enumerate(meta_series):
        first_position.setdefault(uid, position)

    img_locs = []
    for d_serie in desc_series["series_uid"]:
        position = first_position.get(d_serie)
        if position is None:
            # This branch used to print the undefined name `d_series`, which
            # raised NameError instead of reporting the missing UID.
            print(d_serie, "not found")
        else:
            img_locs.append("CBIS-DDSM_Clean_Data/images_png/" + paths[position])
    return img_locs
            

In [118]:
# Function for matching data sets series to images paths
def get_sets_images(desc_series, meta_series, paths):
    """Return the PNG location of every described case whose series UID is known.

    Parameters
    ----------
    desc_series : mapping or DataFrame
        Object whose ``"series_uid"`` entry iterates over the UIDs to look up.
    meta_series : sequence of str
        Series UIDs, aligned position-by-position with ``paths``.
    paths : sequence of str
        Image file names; ``paths[i]`` belongs to ``meta_series[i]``.

    Returns
    -------
    list of str
        ``"CBIS-DDSM_Clean_Data/images_png/" + file name`` for every matched
        UID, in the order of ``desc_series["series_uid"]``. When a UID occurs
        several times in ``meta_series`` its first occurrence is used.
        Unmatched UIDs are reported on stdout and skipped, so the result can
        be shorter than the input.
    """
    # Map each UID to the position of its first occurrence (what list.index()
    # returned), built once instead of rescanning the list for every case.
    first_position = {}
    for position, uid in enumerate(meta_series):
        first_position.setdefault(uid, position)

    img_locs = []
    for d_serie in desc_series["series_uid"]:
        position = first_position.get(d_serie)
        if position is None:
            # This branch used to print the undefined name `d_series`, which
            # raised NameError instead of reporting the missing UID.
            print(d_serie, "not found")
        else:
            img_locs.append("CBIS-DDSM_Clean_Data/images_png/" + paths[position])
    return img_locs
            

In [119]:
# Resolve the PNG location of every case in the training and testing sets
train_paths = get_sets_images(desc_series=train_orig, meta_series=series_to_imgs, paths=paths_to_files)
test_paths = get_sets_images(desc_series=test_orig, meta_series=series_to_imgs, paths=paths_to_files)

# Compare the number of resolved images with the number of cases in each set
n_train_found, n_train_cases = len(train_paths), len(train_orig)
n_test_found, n_test_cases = len(test_paths), len(test_orig)
print(n_train_found, "images were found out of ", n_train_cases)
print(n_test_found, "images were found out of ", n_test_cases)

2864 images were found out of  2864
704 images were found out of  704


In [120]:
# Inspect the column labels currently present in the training set.
# NOTE(review): the saved output of this cell already lists 'images_new_paths'
# and 'image_type', which are only assigned in later cells - it looks like it
# came from an out-of-order run; restart the kernel and run all to refresh it.
train_orig.columns

Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path',
       'series_uid', 'images_new_paths', 'image_type'],
      dtype='object')

In [121]:
# Store the resolved PNG locations in a new "images_new_paths" column of each set
for frame, new_paths in ((train_orig, train_paths), (test_orig, test_paths)):
    frame["images_new_paths"] = new_paths

### Add series description (ROI Mask or Full Mammogram) to dataframe

In [122]:
# function for matching data sets series to their series descriptions
def get_sets_desc(set_series, meta_series, descriptions):
    """Return the series description of every case whose series UID is known.

    Parameters
    ----------
    set_series : mapping or DataFrame
        Object whose ``"series_uid"`` entry iterates over the UIDs to look up.
    meta_series : pandas.Series or sequence of str
        Series UIDs, aligned position-by-position with ``descriptions``.
    descriptions : pandas.Series or sequence
        Descriptions, indexed with the position of the matching UID.
        NOTE(review): for a pandas Series this is a label lookup, so it
        assumes the default 0..n-1 index - confirm if the metadata is ever
        filtered or re-indexed.

    Returns
    -------
    list
        One description per matched UID, in the order of
        ``set_series["series_uid"]``. When a UID occurs several times in
        ``meta_series`` its first occurrence is used. Unmatched UIDs are
        reported on stdout and skipped, so the result can be shorter than
        the input.
    """
    # Materialise the UIDs once; the previous code called to_list() again for
    # every case.
    uid_list = meta_series.to_list() if hasattr(meta_series, "to_list") else list(meta_series)

    # Position of the first occurrence of each UID, as list.index() returned.
    first_position = {}
    for position, uid in enumerate(uid_list):
        first_position.setdefault(uid, position)

    descriptions_list = []
    for s_serie in set_series["series_uid"]:
        position = first_position.get(s_serie)
        if position is None:
            # list.index() raises ValueError rather than returning -1, so the
            # old "index > -1" check could never reach this branch.
            print(s_serie, "not found")
        else:
            descriptions_list.append(descriptions[position])
    return descriptions_list

In [123]:
# Look up the series description of every case in each set, using the
# "Series UID" and "Series Description" columns of the metadata table
meta_uids = metadata_df["Series UID"]
meta_descs = metadata_df["Series Description"]
train_descs = get_sets_desc(train_orig, meta_uids, meta_descs)
test_descs = get_sets_desc(test_orig, meta_uids, meta_descs)

# Compare the number of descriptions found with the number of cases in each set
n_train_descs, n_train_cases = len(train_descs), len(train_orig)
n_test_descs, n_test_cases = len(test_descs), len(test_orig)
print(n_train_descs, "descriptions were found out of ", n_train_cases)
print(n_test_descs, "descriptions were found out of ", n_test_cases)

2864 descriptions were found out of  2864
704 descriptions were found out of  704


In [124]:
# Store the series descriptions in a new "image_type" column of each set
for frame, descs in ((train_orig, train_descs), (test_orig, test_descs)):
    frame["image_type"] = descs

### Save the newly organized DataFrames to CSV

In [125]:
# Inspect the column labels of the training set before selecting the columns to export
train_orig.columns

Index(['breast_density', 'breast_side', 'image_view', 'abnormality_type',
       'pathology', 'image_path', 'cropped_image_path', 'ROI_mask_path',
       'series_uid', 'images_new_paths', 'image_type'],
      dtype='object')

In [126]:
# Columns written to disk; the three original path columns (image_path,
# cropped_image_path, ROI_mask_path) are left out
cols_to_save = [
    'breast_density',
    'breast_side',
    'image_view',
    'abnormality_type',
    'pathology',
    'series_uid',
    'images_new_paths',
    'image_type',
]

# Write each set to its own CSV file without the row index
for frame, csv_path in (
    (train_orig, "CBIS-DDSM_Clean_Data/train_descriptions.csv"),
    (test_orig, "CBIS-DDSM_Clean_Data/test_descriptions.csv"),
):
    frame[cols_to_save].to_csv(csv_path, index=False)