### Import Libraries for Data Preprocessing

In [1]:
# import libraries
import pandas as pd
import pydicom       # for reading and writing DICOM files
import numpy as np
import matplotlib.pyplot as plt
import cv2

### Read metadata 

In [2]:
# open metadata file
path = 'CBIS-DDSM_Data/images_dataset/metadata.csv' 
metadata_df = pd.read_csv(path)

# show content
# print(metadata_df.head(2))

In [3]:
# print the columns of the metadata
print(metadata_df.columns)

Index(['Series UID', 'Collection', '3rd Party Analysis',
       'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
       'Study Date', 'Series Description', 'Manufacturer', 'Modality',
       'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
       'File Location', 'Download Timestamp'],
      dtype='object')


In [4]:
dirs = metadata_df["File Location"].str.replace("\\", "/", regex=False)
dirs[:3] 

0    ./CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/08-29-...
1    ./CBIS-DDSM/Calc-Test_P_00038_LEFT_MLO/08-29-2...
2    ./CBIS-DDSM/Calc-Test_P_00038_LEFT_CC/08-29-20...
Name: File Location, dtype: object

In [5]:
dirs[0]

'./CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/08-29-2017-DDSM-NA-94942/1.000000-ROI mask images-18515'

In [6]:
series = metadata_df["Series UID"]

### Access images
- Find and build paths to download data

In [7]:
import os

# path_to_dir = "/CBIS-DDSM_Data/images_dataset/" + metadata_df["File Location"][0]  
main_dir = 'CBIS-DDSM_Data/images_dataset/'

# Adds main directory parent path 
dirs_paths = [main_dir + d for d in dirs]
print("Number of directories with images", len(dirs_paths))
print(dirs_paths[0])

# Access each file in the directory 
files_per_dir = [os.listdir(path) for path in dirs_paths]
files_per_dir

Number of directories with images 6775
CBIS-DDSM_Data/images_dataset/./CBIS-DDSM/Calc-Test_P_00038_LEFT_CC_1/08-29-2017-DDSM-NA-94942/1.000000-ROI mask images-18515


[['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm'],
 ['1-1.dcm', '1-2.dcm'],
 ['1-1.dcm', '1-2.dc

In [8]:
# Test: try decompresing and accesing one file
# If file was access, no error should be printed
full_path = dirs_paths[1] + "/" + files_per_dir[0][0]
ds = pydicom.dcmread(full_path)
print(ds.file_meta.TransferSyntaxUID.name)

Implicit VR Little Endian


### Extract Files 
Uses the data from the metadata file for extracting and decompresing images.

- Paths to images files
- The number of files that each directory contain. This help to extract ROIs and Cropped images
- The series to use as a name. However, for each series there may be more than one file. Therefore a termination to identify each image will be added.  

In [None]:
# Iterate through files for extracting images and saves them in directories with the  series UIDs
from pydicom.pixel_data_handlers.util import apply_voi_lut

# make images directory
output_dir = "CBIS-DDSM_Clean_Data/images_png/"
os.makedirs(output_dir, exist_ok=True)

# iterate paths, files and series arrays
for path, files, serie in zip(dirs_paths, files_per_dir, series):
    # iterate files for each directory of series 
    for file in files:
        # concatenate path and file name
        full_path = path + "/" + file
        
        new_file_name = serie + "_" + file.replace(".dcm", ".png")

        # read dicom file data 
        img_ds = pydicom.dcmread(full_path)

        # create pixel arrays
        pixels_arr = apply_voi_lut(img_ds.pixel_array, img_ds)

        # Normalize array
        img_pixels = (pixels_arr - pixels_arr.min()) / (pixels_arr.max() - pixels_arr.min()) * 255

        # Assigns type of array
        img_pixels = img_pixels.astype("uint8")

        # Creates file
        cv2.imwrite(os.path.join(output_dir, new_file_name), img_pixels)

        