# The DICOM to PNG Cropped Notebook

##### This notebook splits the cancer and no cancer classes further into craniocaudal (cc) and mediolateral oblique (MLO) views.
##### It ensures the views correspond: for each CC view of patient x, there is an MLO view of patient x.
##### It then applies pre-processing techniques and crops the images to the breast region.


## Import the Relevant Libraries

In [1]:
# Allows for the access to the image directories
import os

In [2]:
# Allows for me to work with the DICOM files
import pydicom

In [3]:
# Allows me to work with and manipulate the image array
import numpy as np

In [4]:
# Allows me to apply pre-processing techniques to the images
import cv2

In [5]:
# Allows for dataframe creation as well as manipulation and cleaning of the data
import pandas as pd

## Import and Read the CSV File - Cancer 

In [6]:
# Metadata of the training images - cancer
cancer_meta = pd.read_csv("new_sorted_cancer_df.csv")

In [7]:
# Display the first 2 rows of the metadata
cancer_meta.head(2)

Unnamed: 0,patient_image_id,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,10130_1360338805,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False
1,10130_1672636630,1,10130,1672636630,L,MLO,71.0,1,1,1,0.0,0,B,49,False


In [8]:
# Get unique values of the view column
unique_views_cancer = cancer_meta["view"].unique()

# Print the views of the mammography images in use 
print(unique_views_cancer)

['CC' 'MLO' 'AT']


## Import and Read the CSV File - No Cancer 

In [9]:
# Metadata of the training images - no cancer
no_cancer_meta = pd.read_csv("new_sorted_non_cancer_df.csv")

In [10]:
# Display the last 2 rows of the metadata
no_cancer_meta.tail(2)

Unnamed: 0,patient_image_id,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
2263,9549_760279512,1,9549,760279512,L,MLO,48.0,0,0,0,2.0,0,A,49,False
2264,9549_856218783,1,9549,856218783,R,CC,48.0,0,0,0,2.0,0,A,49,False


In [11]:
# Get unique values of the view column
unique_views_no_cancer = no_cancer_meta["view"].unique()

# Print the views of the mammography images in use 
print(unique_views_no_cancer)

['CC' 'MLO' 'LM' 'AT']


## Compile Cancer and No Cancer ID Lists with Views

### Cancer

#### CC - CranioCaudal View

In [12]:
# Filter the cancer dataframe by the CC view
cancer_CC = cancer_meta[cancer_meta["view"] == "CC"]

In [13]:
# Assign the patient_image_id with a CC view to a list
cancer_CC_list = cancer_CC["patient_image_id"].tolist()

In [14]:
# Print the length of the list
print(len(cancer_CC_list))

566


#### MLO - MedioLateral Oblique View

In [15]:
cancer_MLO = cancer_meta[cancer_meta["view"] == "MLO"]

In [16]:
# Assign the patient_image_id with a MLO view to a list
cancer_MLO_list = cancer_MLO["patient_image_id"].tolist()

In [17]:
# Print the length of the list
print(len(cancer_MLO_list))

590


#### AT - Axillary Tail View

In [18]:
# Filter the cancer dataframe by the AT view
cancer_AT = cancer_meta[cancer_meta["view"] == "AT"]

In [19]:
# Assign the patient_image_id with an AT view to a list
cancer_AT_list = cancer_AT["patient_image_id"].tolist()

In [20]:
# Print the length of the list
print(len(cancer_AT_list))

2


###### The Axillary Tail View only consists of two images, so these images will not be part of the dataset used in the CNN.

#### Equal Lengths CC and MLO View with Patient ID Correspondence

In [21]:
# Pair every cancer CC image with one not-yet-used MLO image of the same
# patient, so both result lists have equal length and matching patient ids
# at every index. Ids have the form "<patient_id>_<image_id>".
c_CC_list = []
c_MLO_list = []

# Group the distinct MLO ids by patient id, preserving their original order
mlo_ids_by_patient = {}
seen_mlo_ids = set()
for mlo_id in cancer_MLO_list:
    # A repeated MLO id may only ever be paired once
    if mlo_id in seen_mlo_ids:
        continue
    seen_mlo_ids.add(mlo_id)

    # Get the patient id
    mlo_patient_id, mlo_image_id = mlo_id.split("_")
    mlo_ids_by_patient.setdefault(mlo_patient_id, []).append(mlo_id)

# One iterator per patient hands out each MLO id at most once, in list order
next_unused_mlo = {
    patient_id: iter(mlo_ids) for patient_id, mlo_ids in mlo_ids_by_patient.items()
}

# Iterate through each id in cancer_CC_list
paired_cc_ids = set()
for cc_id in cancer_CC_list:
    # Get the patient id
    cc_patient_id, cc_image_id = cc_id.split("_")

    # A CC id that is already paired must not consume a second MLO image
    if cc_id in paired_cc_ids:
        continue

    # Take the first MLO image of this patient that is still unused (if any)
    mlo_id = next(next_unused_mlo.get(cc_patient_id, iter(())), None)
    if mlo_id is None:
        continue

    paired_cc_ids.add(cc_id)
    c_CC_list.append(cc_id)
    c_MLO_list.append(mlo_id)

In [22]:
c_CC_list[:5]

['10130_1360338805',
 '10130_613462606',
 '10226_530620473',
 '1025_773597682',
 '10432_458553546']

In [23]:
c_MLO_list[:5]

['10130_1672636630',
 '10130_388811999',
 '10226_461614796',
 '1025_1803952236',
 '10432_1434858530']

In [24]:
# Count the index-aligned CC/MLO pairs whose patient ids (the part of the id
# before the underscore) are identical
count = sum(
    cc_id.split("_")[0] == mlo_id.split("_")[0]
    for cc_id, mlo_id in zip(c_CC_list, c_MLO_list)
)

print(count)

518


In [25]:
print("Cancer - cc view: ", len(c_CC_list))
print("Cancer - mlo view: ", len(c_MLO_list))

Cancer - cc view:  518
Cancer - mlo view:  518


### No Cancer

#### CC - CranioCaudal View

In [26]:
# Filter the no cancer dataframe by the CC view
no_cancer_CC = no_cancer_meta[no_cancer_meta["view"] == "CC"]

In [27]:
# Assign the patient_image_id with a CC view to a list
no_cancer_CC_list = no_cancer_CC["patient_image_id"].tolist()

In [28]:
# Print the length of the list
print(len(no_cancer_CC_list))

1125


#### MLO - MedioLateral Oblique View

In [29]:
no_cancer_MLO = no_cancer_meta[no_cancer_meta["view"] == "MLO"]

In [30]:
# Assign the patient_image_id with a MLO view to a list
no_cancer_MLO_list = no_cancer_MLO["patient_image_id"].tolist()

In [31]:
# Print the length of the list
print(len(no_cancer_MLO_list))

1136


#### Equal Lengths CC and MLO View with Patient ID Correspondence

In [32]:
# Pair every no cancer CC image with one not-yet-used MLO image of the same
# patient, so both result lists have equal length and matching patient ids
# at every index. Ids have the form "<patient_id>_<image_id>".
nc_CC_list = []
nc_MLO_list = []

# Group the distinct MLO ids by patient id, preserving their original order
mlo_ids_by_patient = {}
seen_mlo_ids = set()
for mlo_id in no_cancer_MLO_list:
    # A repeated MLO id may only ever be paired once
    if mlo_id in seen_mlo_ids:
        continue
    seen_mlo_ids.add(mlo_id)

    # Get the patient id
    mlo_patient_id, mlo_image_id = mlo_id.split("_")
    mlo_ids_by_patient.setdefault(mlo_patient_id, []).append(mlo_id)

# One iterator per patient hands out each MLO id at most once, in list order
next_unused_mlo = {
    patient_id: iter(mlo_ids) for patient_id, mlo_ids in mlo_ids_by_patient.items()
}

# Iterate through each id in no_cancer_CC_list
paired_cc_ids = set()
for cc_id in no_cancer_CC_list:
    # Get the patient id
    cc_patient_id, cc_image_id = cc_id.split("_")

    # A CC id that is already paired must not consume a second MLO image
    if cc_id in paired_cc_ids:
        continue

    # Take the first MLO image of this patient that is still unused (if any)
    mlo_id = next(next_unused_mlo.get(cc_patient_id, iter(())), None)
    if mlo_id is None:
        continue

    paired_cc_ids.add(cc_id)
    nc_CC_list.append(cc_id)
    nc_MLO_list.append(mlo_id)

In [33]:
nc_CC_list[:5]

['10095_1450760951',
 '10200_134267365',
 '10200_161478494',
 '10200_534538517',
 '10200_846834855']

In [34]:
nc_MLO_list[:5]

['10095_1854592291',
 '10200_1422756511',
 '10200_1620368761',
 '10200_570041037',
 '10200_600223902']

In [35]:
# Count the index-aligned CC/MLO pairs whose patient ids (the part of the id
# before the underscore) are identical
count = sum(
    cc_id.split("_")[0] == mlo_id.split("_")[0]
    for cc_id, mlo_id in zip(nc_CC_list, nc_MLO_list)
)

print(count)

971


In [36]:
print("No cancer - cc view: ", len(nc_CC_list))
print("No cancer - mlo view: ",len(nc_MLO_list))

No cancer - cc view:  971
No cancer - mlo view:  971


## Ensure No Doubles

In [37]:
# Add the cc and mlo list of cancer 
c_lst = c_CC_list + c_MLO_list

# Add the cc and mlo list of no cancer 
nc_lst = nc_CC_list + nc_MLO_list

In [38]:
# Pool the ids of both classes into a single list
combined_list = [*c_lst, *nc_lst]

# A set drops repeated ids, so its size only equals the list length
# when every id occurs exactly once
combined_set = set(combined_list)

print(
    "Duplicates found - some items appear in multiple lists."
    if len(combined_set) != len(combined_list)
    else "No duplicates found - each item is only in one list."
)

No duplicates found - each item is only in one list.


## Defining the Directories

### Directory where RSNA DICOM Images are Saved

##### From

In [39]:
# Cancer
data_dir_c = "RSNA_Mammograms_Class/cancer/"

# No cancer
data_dir_nc = "RSNA_Mammograms_Class/no_cancer/"

### Directory where RSNA PNG Images will be Saved

##### To

In [40]:
# Cancer - cc view
out_dir_c_cc = "RSNA_PNG_Mammograms/cancer/cc/"

# Cancer - mlo view
out_dir_c_mlo = "RSNA_PNG_Mammograms/cancer/mlo/"


# No cancer - cc view
out_dir_nc_cc = "RSNA_PNG_Mammograms/no_cancer/cc/"

# No cancer - mlo view
out_dir_nc_mlo = "RSNA_PNG_Mammograms/no_cancer/mlo/"

## Converting the RSNA DICOM Images to PNG

# Functions for Pre-Processing

### MONOCHROME1 vs MONOCHROME2

-  If the photometric interpretation of the image is MONOCHROME1, the pixel values are inverted.
-  If the photometric interpretation of the image is MONOCHROME2, the pixel values are shifted so that they start from zero.
-  If the photometric interpretation of the image is neither of the above, a ValueError is raised.

In [41]:
# Allows for a consistent interpretation
def photometric_interpretation(ds, ds_img):
    """Adjust a pixel array according to the DICOM PhotometricInterpretation.

    Args:
        ds: Object exposing a ``PhotometricInterpretation`` attribute
            (here the dataset returned by ``pydicom.dcmread``).
        ds_img: Pixel array belonging to ``ds``.

    Returns:
        A new array: ``ds_img.max() - ds_img`` (inverted) for MONOCHROME1,
        or ``ds_img - ds_img.min()`` (shifted to start at zero) for MONOCHROME2.

    Raises:
        ValueError: If the photometric interpretation is missing or is neither
            MONOCHROME1 nor MONOCHROME2.
    """
    # A missing attribute is reported through the same ValueError as an
    # unsupported value, instead of surfacing as an AttributeError
    interpretation = getattr(ds, "PhotometricInterpretation", None)

    if interpretation == "MONOCHROME1":
        # Invert the pixel values
        return ds_img.max() - ds_img

    if interpretation == "MONOCHROME2":
        # Shift the pixel values so that the minimum becomes zero
        return ds_img - ds_img.min()

    # Report the offending value (the original referenced an undefined name
    # here, which raised NameError instead of the intended ValueError)
    raise ValueError("Invalid Photometric Interpretation: {}".format(interpretation))

## Windowing

- The windowing function gets the relevant attribute values of the DICOM object.
- Adjusts the image array according to the attribute values. 
- And returns the adjusted image array.

In [42]:
# Allows for DICOM image windowing
def windowing(ds, ds_img):
    """Apply the rescale and windowing attributes of a DICOM object to its pixels.

    The steps are applied in this order:

    1. If both ``RescaleSlope`` and ``RescaleIntercept`` are present, the
       pixels become ``ds_img * slope + intercept``.
    2. If ``VOILUTFunction`` is ``"SIGMOID"``, the sigmoid
       ``1 / (1 + exp(-(img - center) / width))`` is returned.
    3. Otherwise, if ``PixelIntensityRelationshipSign`` is ``-1``, the pixels
       are clipped to ``[center - width // 2, center + width // 2]``.
    4. Otherwise the (possibly rescaled) pixels are returned without windowing.

    Args:
        ds: Object exposing ``WindowCenter`` and ``WindowWidth`` and, optionally,
            the attributes named above.
        ds_img: Pixel array to adjust. It is never modified in place.

    Returns:
        The adjusted pixel array. When no step applies, this is ``ds_img`` itself.

    Raises:
        AttributeError: If ``WindowCenter`` or ``WindowWidth`` is missing.
    """
    # Get the window center and window width
    window_center = ds.WindowCenter
    window_width = ds.WindowWidth

    # If more than one value is present, use the first one
    if isinstance(window_center, pydicom.multival.MultiValue):
        center = window_center[0]
    else:
        center = window_center

    if isinstance(window_width, pydicom.multival.MultiValue):
        width = window_width[0]
    else:
        width = window_width

    # Optional attributes - None means "not present"
    pixel_intensity = getattr(ds, "PixelIntensityRelationshipSign", None)
    rescale_intercept = getattr(ds, "RescaleIntercept", None)
    rescale_slope = getattr(ds, "RescaleSlope", None)
    function_lut = getattr(ds, "VOILUTFunction", None)

    # If present apply the rescale transformation (creates a new array)
    if rescale_intercept is not None and rescale_slope is not None:
        img = ds_img * rescale_slope + rescale_intercept
    else:
        img = ds_img

    if function_lut == "SIGMOID":
        # Apply sigmoid function
        return 1 / (1 + np.exp(-(img - center) / width))

    # NOTE(review): clipping is only applied when the relationship sign is -1;
    # confirm this is the intended condition for the dataset in use.
    if pixel_intensity == -1:
        min_value = center - (width // 2)        # Lower bound of the window
        max_value = center + (width // 2)        # Upper bound of the window

        # Clip on a private copy so the caller's array is left untouched
        if img is ds_img:
            img = ds_img.copy()

        # Raise everything below the window to the lower bound
        img[img < min_value] = min_value
        # Lower everything above the window to the upper bound
        img[img > max_value] = max_value

    # Return the windowed image
    return img

### Normalise Image Pixel Values

-  Different image arrays might have varying pixel intensity ranges.
-  To ensure consistent input ranges, the image array must be normalised
-  The values of the image array get scaled to a range of 0-255, which is the standard range for 8-bit images.
-  The image array then gets cast from 16-bit, which is a common size for DICOM, into an 8-bit array.

In [43]:
# Normalise the image array
def normalise(ds_img):
    """Scale a pixel array linearly to the range [0, 255] and cast it to uint8.

    Args:
        ds_img: Numeric pixel array.

    Returns:
        A ``np.uint8`` array with the same shape as ``ds_img``. The minimum
        input value maps to 0 and the maximum to 255. An image in which all
        pixels share one value has no contrast to stretch and maps to all zeros.
    """
    # Get the min and max values in the pixel array
    min_intensity = ds_img.min()
    max_intensity = ds_img.max()
    intensity_range = max_intensity - min_intensity

    # A constant image would otherwise divide by zero and yield NaN values
    if intensity_range == 0:
        return np.zeros(ds_img.shape, dtype=np.uint8)

    # Scale pixel values to the range [0, 255]
    normalised_pixels = ((ds_img - min_intensity) / intensity_range) * 255

    # Return the normalised pixels as 8-bits
    return normalised_pixels.astype(np.uint8)

### Determine Laterality

- The lateral_side function determines if the mammogram is of the left or right breast.
- It sums  the left and right halves of the image array. 
- The side with the greater sum of pixel values is considered to contain the breast tissue, since more pixels indicate more non-background content.
- Determines, which side holds the greater value and then sets the laterality value.

In [45]:
# Determines the side of the breast
def lateral_side(ds_img):
    """Return "L" or "R" depending on which half of the image is brighter.

    The image is split at the middle column (for an odd width the right half
    gets the extra column). "L" is returned only when the left half has the
    strictly larger pixel sum; a tie yields "R".
    """
    midpoint = ds_img.shape[1] // 2

    # Total intensity on either side of the middle column
    left_total = ds_img[:, :midpoint].sum()
    right_total = ds_img[:, midpoint:].sum()

    if left_total > right_total:
        return "L"
    return "R"

### Crop the Image

- The crop_background function ensures the image array is of type uint8, which is the type needed for cv2.
- After flipping the image horizontally, if needed, a binary image is produced in which pixels greater than 5 are set to 1 and the rest are set to 0.
- This threshold image is then used to find the contours of the image.
- The x and y max values of the contour are determined with an extra margin added for safety.
- The image is cropped according to the x and y values extracted and returned.

In [46]:
# Crop the background
def crop_background(ds, ds_img):
    """Crop an image to the extent of its largest bright region.

    Args:
        ds: Not used by this function.
        ds_img: Pixel array; it is cast to uint8 before processing.

    Returns:
        The uint8 image (mirrored left-to-right when ``lateral_side`` reports
        "R"), cropped from the top-left corner to the largest contour's
        maximum x and y coordinates enlarged by 4% and 2% respectively.
        If no contour is found the uncropped uint8 image is returned.
    """
    # Image array for CV2
    pixels = ds_img.astype(np.uint8)

    # Mirror the image horizontally (flip code 1) when the right half is the
    # brighter one, so the brighter half always ends up on the left
    if lateral_side(pixels) == "R":
        pixels = cv2.flip(pixels, 1)

    # Binary mask: pixels greater than 5 become 1, everything else becomes 0
    _, mask = cv2.threshold(pixels, 5, 1, cv2.THRESH_BINARY)

    # Outer contours of the mask
    contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]

    # Nothing to crop to: hand back the uncropped image
    if not contours:
        return pixels

    # Keep only the contour enclosing the largest area
    largest = max(contours, key=cv2.contourArea)

    # Furthest x / y reached by that contour, enlarged by a safety margin
    right_edge = int(1.04 * np.max(largest[:, :, 0]))
    bottom_edge = int(1.02 * np.max(largest[:, :, 1]))

    # The crop always starts at the top-left corner of the image
    return pixels[0:bottom_edge, 0:right_edge]

### CLAHE 

-  To improve the overall contrast, histogram equalisation, a basic image processing technique, is applied.
-  This ensures that pixel values are more evenly distributed across the entire intensity range
-  Given that large intensity variations might be present, adaptive histogram equalisation (CLAHE) is used instead of global histogram equalisation.
-  CLAHE subdivides the image into smaller blocks and applies histogram equalisation to each block individually, with a contrast limit to avoid amplifying noise that may be present.

In [47]:
# Contrast Limited Adaptive Histogram Equalization (CLAHE)
def contrast_enhancing(ds_img, clip_limit=0.8, tile_grid_size=(16, 16)):
    """Enhance local contrast with OpenCV's CLAHE.

    Args:
        ds_img: Grayscale pixel array; it is cast to uint8 before processing.
        clip_limit: Value passed to ``cv2.createCLAHE`` as ``clipLimit``.
            Defaults to 0.8, the value this notebook has always used.
        tile_grid_size: Value passed to ``cv2.createCLAHE`` as ``tileGridSize``.
            Defaults to (16, 16), the value this notebook has always used.

    Returns:
        The contrast-enhanced image as returned by ``clahe.apply``.
    """
    # Image array for CV2
    img = ds_img.astype(np.uint8)

    # Create a CLAHE object with the requested (non-OpenCV-default) parameters
    clahe = cv2.createCLAHE(clipLimit = clip_limit, tileGridSize = tile_grid_size)

    # Apply CLAHE to the grayscale image
    clahe_image = clahe.apply(img)

    # Return the enhanced image
    return clahe_image

## Using Pydicom Iterate Through the Directories

- The pre_process function takes 3 inputs, a specified list, specified source directory and specified destination directory.
- It uses a single for loop to iterate over all the images in the specified list.
- The source and destination image paths are created with the help of the patient_image_id's contained within the lists.
- Each image is read with pydicom and the functions described above are applied.
- The modified image arrays are then written, with the help of CV2, to the predefined destination paths.

In [48]:
# Function to convert each cancer or no cancer image list to PNG files in a specified folder
def pre_process(spes_list, spes_data_dir, folder_path):
    """Convert the listed DICOM images to pre-processed PNG files.

    For every id in ``spes_list`` the file ``<spes_data_dir>/<id>.dcm`` is read
    and passed through photometric_interpretation, windowing, normalise,
    crop_background and contrast_enhancing (in that order). The result is
    written to ``<folder_path>/<id>.png``.

    Args:
        spes_list: Iterable of image ids (file names without extension).
        spes_data_dir: Directory containing the source ``.dcm`` files.
        folder_path: Directory that receives the ``.png`` files. It is created
            when it does not exist yet.

    Returns:
        None. Files that could not be written are reported with ``print``.
    """
    # cv2.imwrite cannot create folders, so make sure the destination exists
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    # Iterate through the specified list
    for img in spes_list:
        # Set the source path for each dicom image
        img_path = os.path.join(spes_data_dir, img + ".dcm")

        # Set the path for saving
        out_path = os.path.join(folder_path, img + ".png")

        # Read the DICOM image
        ds = pydicom.dcmread(img_path)

        # Convert to a pixel array
        ds_img = ds.pixel_array

        # Check MONOCHROME1 vs MONOCHROME2
        img_photometric = photometric_interpretation(ds, ds_img)

        # Apply windowing
        img_windowed = windowing(ds, img_photometric)

        # Normalise the image
        img_normalise = normalise(img_windowed)

        # Crop the background
        img_crop = crop_background(ds, img_normalise)

        # Apply contrast enhancement
        img_clahe = contrast_enhancing(img_crop)

        # Try and write the image file created
        try:
            written = cv2.imwrite(out_path, img_clahe)
        except Exception as exc:
            # Print the file name and the reason if an error occurred
            print("An exception occurred at file ", out_path, "-", exc)
            continue

        # cv2.imwrite signals most failures through its boolean return value
        # rather than an exception, so that has to be checked as well
        if not written:
            print("An exception occurred at file ", out_path, "- cv2.imwrite reported failure")

### Cancer

In [49]:
# Write the cancer PNG CC images to the specified folder - 518 images
pre_process(c_CC_list, data_dir_c, out_dir_c_cc)

In [50]:
# Write the cancer PNG MLO images to the specified folder - 518 images
pre_process(c_MLO_list, data_dir_c, out_dir_c_mlo)

### No Cancer

In [51]:
# Write the no cancer PNG CC images to the specified folder - 971 images
pre_process(nc_CC_list, data_dir_nc, out_dir_nc_cc)

In [52]:
# Write the no cancer PNG MLO images to the specified folder - 971 images
pre_process(nc_MLO_list, data_dir_nc, out_dir_nc_mlo)

## Check the Folder Totals

In [53]:
# Cancer - CC view
cancer_CC_pp = os.listdir(out_dir_c_cc)

# Print folder size
print(len(cancer_CC_pp))

518


In [54]:
# Cancer - MLO view
cancer_MLO_pp = os.listdir(out_dir_c_mlo)

# Print folder size
print(len(cancer_MLO_pp))

518


In [55]:
# No cancer - CC view
no_cancer_CC_pp = os.listdir(out_dir_nc_cc)

# Print folder size
print(len(no_cancer_CC_pp))

971


In [56]:
# No cancer - MLO view
no_cancer_MLO_pp = os.listdir(out_dir_nc_mlo)

# Print folder size
print(len(no_cancer_MLO_pp))

971


## Ensure No Doubles

In [57]:
# Combine all the lists
combined_list = cancer_CC_pp + cancer_MLO_pp + no_cancer_CC_pp + no_cancer_MLO_pp

# Create a set from the combined list
combined_set = set(combined_list)

In [58]:
# Compare the lengths to check for duplicates
if len(combined_list) == len(combined_set):
    print("No duplicates found - each item is only in one list.")
else:
    print("Duplicates found - some items appear in multiple lists.")

No duplicates found - each item is only in one list.
