# The Data Augmentation Notebook

##### This notebook mirrors every cancer image left-to-right (a flip around the vertical axis) to increase the dataset size.
##### For the no-cancer class, only every second image is flipped.

## Import the Relevant Libraries

In [1]:
# Allows for the access to the image directories
import os

In [2]:
# Allows me to apply data augmentation in the form of flipping 
import cv2

In [3]:
# Allows me to work with and manipulate the image array
import numpy as np

In [4]:
# Allows me to read the images
from PIL import Image

In [5]:
# Allows me to apply random selection
import random

In [6]:
# Allows for the copying of images to new folders
import shutil

## Define Paths

### From

In [7]:
# Cancer
cancer_path_cc = "RSNA_PNG_Mammograms/cancer/cc/"
cancer_path_mlo = "RSNA_PNG_Mammograms/cancer/mlo/"

# No cancer
no_cancer_path_cc = "RSNA_PNG_Mammograms/no_cancer/cc/"
no_cancer_path_mlo = "RSNA_PNG_Mammograms/no_cancer/mlo/"

### To

In [8]:
# Cancer
flip_dir_cancer_cc = "RSNA_PNG_Mammograms/cancer_aug_flip/cc/"
flip_dir_cancer_mlo = "RSNA_PNG_Mammograms/cancer_aug_flip/mlo/"

# No cancer
flip_dir_no_cancer_cc = "RSNA_PNG_Mammograms/no_cancer_aug_flip/cc/"
flip_dir_no_cancer_mlo = "RSNA_PNG_Mammograms/no_cancer_aug_flip/mlo/"

## Test the Paths

### Cancer

In [9]:
cancer_dataset_cc =  os.listdir(cancer_path_cc)
cancer_dataset_cc[1]

'10130_613462606.png'

In [10]:
cancer_dataset_mlo =  os.listdir(cancer_path_mlo)
cancer_dataset_mlo[1]

'10130_388811999.png'

In [11]:
print(len(cancer_dataset_cc) + len(cancer_dataset_mlo))

1036


### No Cancer

In [12]:
no_cancer_dataset_cc =  os.listdir(no_cancer_path_cc)
no_cancer_dataset_cc[1]

'10200_134267365.png'

In [13]:
no_cancer_dataset_mlo =  os.listdir(no_cancer_path_mlo)
no_cancer_dataset_mlo[1]

'10200_1422756511.png'

In [14]:
print(len(no_cancer_dataset_cc) + len(no_cancer_dataset_mlo))

1940


## Ensure No Duplicate File Names

In [15]:
# Combine all lists into one
combined_list = cancer_dataset_cc + cancer_dataset_mlo + no_cancer_dataset_cc + no_cancer_dataset_mlo

In [16]:
# Convert the combined list into a set
combined_set = set(combined_list)

In [17]:
# Compare the lengths to check for duplicates
# A set drops repeated names, so equal sizes mean every file name is unique
names_are_unique = len(combined_set) == len(combined_list)
print(
    "No duplicates found - each item is only in one list."
    if names_are_unique
    else "Duplicates found - some items appear in multiple lists."
)

No duplicates found - each item is only in one list.


## Data Augmentation - Flip Along the Vertical Axis

In [18]:
# Function for flipping images along the vertical axis 
def augment_flipping(from_path, dataset, to_path, n):
    """Write a left-right mirrored copy of every n-th image in ``dataset``.

    Each selected file is read from ``from_path`` as a grayscale image,
    mirrored around its vertical axis (``cv2.flip`` with flip code 1) and
    saved in ``to_path`` as ``<original name without extension>_flip.png``.

    Args:
        from_path: Directory that contains the source images.
        dataset: Sequence of image file names located in ``from_path``.
        to_path: Directory that receives the flipped images. It is created
            when it does not exist yet.
        n: Step of the selection. ``dataset[::n]`` is processed, so 1 flips
            every image and 2 flips every second image, starting with the
            first one.

    Raises:
        OSError: If an image cannot be read or a flipped image cannot be
            written.
    """
    # cv2.imwrite does not create missing folders, so make sure the target exists
    if to_path:
        os.makedirs(to_path, exist_ok=True)

    # Iterate through the selected images of the folder
    for img in dataset[::n]:
        # Set the path
        path = os.path.join(from_path, img)

        # Read the image in grayscale mode; cv2.imread signals a missing or
        # undecodable file by returning None instead of raising
        img_gray = cv2.imread(path, 0)
        if img_gray is None:
            raise OSError("Could not read image: " + path)

        # Mirror the image left-to-right (flip code 1 = around the vertical axis)
        img_flip = cv2.flip(img_gray, 1)

        # Set the name of the image; splitext copes with any extension length
        stem = os.path.splitext(img)[0]
        out_path = os.path.join(to_path, stem + "_flip.png")

        # Save the image and fail loudly if OpenCV reports that nothing was written
        if not cv2.imwrite(out_path, img_flip):
            raise OSError("Could not write image: " + out_path)

### Cancer
Flip all cancer images to increase the dataset size.

In [19]:
augment_flipping(cancer_path_cc, cancer_dataset_cc, flip_dir_cancer_cc, 1)

In [20]:
augment_flipping(cancer_path_mlo, cancer_dataset_mlo, flip_dir_cancer_mlo, 1)

### No Cancer
Flip every second no cancer image.

In [21]:
augment_flipping(no_cancer_path_cc, no_cancer_dataset_cc, flip_dir_no_cancer_cc, 2)

In [22]:
augment_flipping(no_cancer_path_mlo, no_cancer_dataset_mlo, flip_dir_no_cancer_mlo, 2)

## Test the Path Totals

### Cancer

In [23]:
test_cancer_dataset_cc =  os.listdir(cancer_path_cc)
test_cancer_dataset_mlo =  os.listdir(cancer_path_mlo)

In [24]:
test_cancer_dataset_cc_flip =  os.listdir(flip_dir_cancer_cc)
test_cancer_dataset_mlo_flip =  os.listdir(flip_dir_cancer_mlo)

In [25]:
# Number of files per folder: original cc, original mlo, flipped cc, flipped mlo
cancer_file_1, cancer_file_2, cancer_file_3, cancer_file_4 = map(
    len,
    (
        test_cancer_dataset_cc,
        test_cancer_dataset_mlo,
        test_cancer_dataset_cc_flip,
        test_cancer_dataset_mlo_flip,
    ),
)

In [26]:
# Pair each label with its count so the report and the total share one source
cancer_counts = [
    ("Cancer images cc view: ", cancer_file_1),
    ("Cancer images mlo view: ", cancer_file_2),
    ("Cancer images cc flipped view: ", cancer_file_3),
    ("Cancer images mlo flipped view: ", cancer_file_4),
]
for label, count in cancer_counts:
    print(label + str(count))

print("Total: " + str(sum(count for _, count in cancer_counts)))

Cancer images cc view: 518
Cancer images mlo view: 518
Cancer images cc flipped view: 518
Cancer images mlo flipped view: 518
Total: 2072


### No Cancer

In [27]:
test_no_cancer_dataset_cc =  os.listdir(no_cancer_path_cc)
test_no_cancer_dataset_mlo =  os.listdir(no_cancer_path_mlo)

In [28]:
test_no_cancer_dataset_cc_flip =  os.listdir(flip_dir_no_cancer_cc)
test_no_cancer_dataset_mlo_flip =  os.listdir(flip_dir_no_cancer_mlo)

In [29]:
# Number of files per folder: original cc, original mlo, flipped cc, flipped mlo
no_cancer_file_1, no_cancer_file_2, no_cancer_file_3, no_cancer_file_4 = map(
    len,
    (
        test_no_cancer_dataset_cc,
        test_no_cancer_dataset_mlo,
        test_no_cancer_dataset_cc_flip,
        test_no_cancer_dataset_mlo_flip,
    ),
)

In [30]:
# Pair each label with its count so the report and the total share one source
no_cancer_counts = [
    ("No cancer images cc view: ", no_cancer_file_1),
    ("No cancer images mlo view: ", no_cancer_file_2),
    ("No cancer images cc flipped view: ", no_cancer_file_3),
    ("No cancer images mlo flipped view: ", no_cancer_file_4),
]
for label, count in no_cancer_counts:
    print(label + str(count))

print("Total: " + str(sum(count for _, count in no_cancer_counts)))

No cancer images cc view: 970
No cancer images mlo view: 970
No cancer images cc flipped view: 485
No cancer images mlo flipped view: 485
Total: 2910


## Copy Original and Flipped Images to One Folder Before Splitting

### To

In [31]:
# Cancer
all_cancer_cc = "RSNA_PNG_Mammograms/cancer_all/cc/"
all_cancer_mlo = "RSNA_PNG_Mammograms/cancer_all/mlo/"

# No cancer
all_no_cancer_cc = "RSNA_PNG_Mammograms/no_cancer_all/cc/"
all_no_cancer_mlo = "RSNA_PNG_Mammograms/no_cancer_all/mlo/"

In [33]:
def split_copy(in_path, dataset, out_path):
    """Copy every file named in ``dataset`` from ``in_path`` into ``out_path``.

    The source files are left in place. A file that already exists in
    ``out_path`` under the same name is overwritten by ``shutil.copy``.

    Args:
        in_path: Directory that contains the files to copy.
        dataset: Iterable of file names located in ``in_path``.
        out_path: Destination directory. It is created, including missing
            parent folders, when it does not exist yet.
    """
    # shutil.copy fails on a missing destination folder, so create it up front
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    for png in dataset:
        # Define the source png image path
        source_img_path = os.path.join(in_path, png)

        # Define the destination png image path
        destination_img_path = os.path.join(out_path, png)

        # Copy the file to the determined destination
        shutil.copy(source_img_path, destination_img_path)

#### Cancer

#### CC View

In [34]:
# Original left image set - copy images
split_copy(cancer_path_cc, test_cancer_dataset_cc, all_cancer_cc)

In [35]:
# Flipped right image set - copy images
split_copy(flip_dir_cancer_cc, test_cancer_dataset_cc_flip, all_cancer_cc)

#### MLO View

In [36]:
# Original left image set - copy images
split_copy(cancer_path_mlo, test_cancer_dataset_mlo, all_cancer_mlo)

In [37]:
# Flipped right image set - copy images
split_copy(flip_dir_cancer_mlo, test_cancer_dataset_mlo_flip, all_cancer_mlo)

#### No Cancer

#### CC View

In [38]:
# Original left first image - copy images
# Index 0 is also part of the flipped selection (augment_flipping processed
# no_cancer_dataset_cc[::2]), so this image ends up in the merged folder both
# as original and as flip.
# NOTE(review): confirm this extra copy is intended.
# The listing that drove the flipping is sliced here (not a fresh os.listdir
# result, whose order is arbitrary) so index 0 refers to the same file.
split_copy(no_cancer_path_cc, no_cancer_dataset_cc[0:1], all_no_cancer_cc)

In [39]:
# Original left image set - copy the originals that were NOT flipped.
# augment_flipping processed no_cancer_dataset_cc[::2], so the complement is
# [1::2] of that very same list. A fresh os.listdir result comes back in
# arbitrary order and is not guaranteed to line up with it, hence the
# listing that drove the flipping is sliced here.
split_copy(no_cancer_path_cc, no_cancer_dataset_cc[1::2], all_no_cancer_cc)

In [40]:
# Flipped right image set - copy images
split_copy(flip_dir_no_cancer_cc, test_no_cancer_dataset_cc_flip, all_no_cancer_cc)

#### MLO View

In [41]:
# Original left first image - copy images
# Index 0 is also part of the flipped selection (augment_flipping processed
# no_cancer_dataset_mlo[::2]), so this image ends up in the merged folder both
# as original and as flip.
# NOTE(review): confirm this extra copy is intended.
# The listing that drove the flipping is sliced here (not a fresh os.listdir
# result, whose order is arbitrary) so index 0 refers to the same file.
split_copy(no_cancer_path_mlo, no_cancer_dataset_mlo[0:1], all_no_cancer_mlo)

In [42]:
# Original left image set - copy the originals that were NOT flipped.
# augment_flipping processed no_cancer_dataset_mlo[::2], so the complement is
# [1::2] of that very same list. A fresh os.listdir result comes back in
# arbitrary order and is not guaranteed to line up with it, hence the
# listing that drove the flipping is sliced here.
split_copy(no_cancer_path_mlo, no_cancer_dataset_mlo[1::2], all_no_cancer_mlo)

In [43]:
# Flipped right image set - copy images
split_copy(flip_dir_no_cancer_mlo, test_no_cancer_dataset_mlo_flip, all_no_cancer_mlo)

## Test the Path Totals

### Cancer

In [44]:
# List the merged cancer folders, one listing per view
total_cancer_dataset_cc, total_cancer_dataset_mlo = (
    os.listdir(folder) for folder in (all_cancer_cc, all_cancer_mlo)
)

# Report how many files each merged folder holds
for view_label, listing in (
    ("Cancer cc view: ", total_cancer_dataset_cc),
    ("Cancer mlo view: ", total_cancer_dataset_mlo),
):
    print(view_label, len(listing))

Cancer cc view:  1036
Cancer mlo view:  1036


### No Cancer

In [45]:
# List the merged no-cancer folders, one listing per view
total_no_cancer_dataset_cc, total_no_cancer_dataset_mlo = (
    os.listdir(folder) for folder in (all_no_cancer_cc, all_no_cancer_mlo)
)

# Report how many files each merged folder holds
for view_label, listing in (
    ("No cancer cc view: ", total_no_cancer_dataset_cc),
    ("No cancer mlo view: ", total_no_cancer_dataset_mlo),
):
    print(view_label, len(listing))

No cancer cc view:  971
No cancer mlo view:  971


## Ensure No Duplicates

In [46]:
# Combine all the lists
# Gather the file names of all four merged folders in a single list
combined_list = [
    *total_cancer_dataset_cc,
    *total_cancer_dataset_mlo,
    *total_no_cancer_dataset_cc,
    *total_no_cancer_dataset_mlo,
]

# De-duplicated view of the same names
combined_set = set(combined_list)

In [47]:
# Compare the lengths to check for duplicates
# A set can only be smaller than the list it was built from when some file
# name occurs more than once. The comparison is on file names only: flipped
# copies carry the "_flip" suffix, so an original and its flip never collide.
if len(combined_set) < len(combined_list):
    print("Duplicates found - some items appear in multiple lists.")
else:
    print("No duplicates found - each item is only in one list.")

No duplicates found - each item is only in one list.
