# The Train Validate Test Split Notebook

##### This notebook copies the CC and MLO view images of each class (cancer / no cancer) into separate folders for training, validation and testing.
##### The train, validate, test ratio is 75:15:10.

## Import the Relevant Libraries

In [1]:
# Allows for the access to the image directories
import os

In [2]:
# Allows for the copying of images to new folders
import shutil

## Define Paths

### From

In [3]:
# Source folders: one folder per class and view, each pair is (CC, MLO).

# Cancer class
cancer_path_cc, cancer_path_mlo = (
    "RSNA_PNG_Mammograms/cancer_all/cc/",
    "RSNA_PNG_Mammograms/cancer_all/mlo/",
)

# No-cancer class
no_cancer_path_cc, no_cancer_path_mlo = (
    "RSNA_PNG_Mammograms/no_cancer_all/cc/",
    "RSNA_PNG_Mammograms/no_cancer_all/mlo/",
)

### To

In [4]:
# Destination folders, laid out as <split>_img/<view>/<class>/.
# Every pair below is (CC view, MLO view).

# Training - cancer
train_cancer_cc, train_cancer_mlo = (
    "RSNA_PNG_Mammograms_Split/train_img/cc/cancer/",
    "RSNA_PNG_Mammograms_Split/train_img/mlo/cancer/",
)

# Training - no cancer
train_no_cancer_cc, train_no_cancer_mlo = (
    "RSNA_PNG_Mammograms_Split/train_img/cc/no_cancer/",
    "RSNA_PNG_Mammograms_Split/train_img/mlo/no_cancer/",
)


# Validation - cancer
validate_cancer_cc, validate_cancer_mlo = (
    "RSNA_PNG_Mammograms_Split/val_img/cc/cancer/",
    "RSNA_PNG_Mammograms_Split/val_img/mlo/cancer/",
)

# Validation - no cancer
validate_no_cancer_cc, validate_no_cancer_mlo = (
    "RSNA_PNG_Mammograms_Split/val_img/cc/no_cancer/",
    "RSNA_PNG_Mammograms_Split/val_img/mlo/no_cancer/",
)


# Testing - cancer
test_cancer_cc, test_cancer_mlo = (
    "RSNA_PNG_Mammograms_Split/test_img/cc/cancer/",
    "RSNA_PNG_Mammograms_Split/test_img/mlo/cancer/",
)

# Testing - no cancer
test_no_cancer_cc, test_no_cancer_mlo = (
    "RSNA_PNG_Mammograms_Split/test_img/cc/no_cancer/",
    "RSNA_PNG_Mammograms_Split/test_img/mlo/no_cancer/",
)

## Create CC and MLO View, Original and Flipped, Cancer and No Cancer Lists

In [5]:
# os.listdir() returns entries in arbitrary order. The train / validate / test
# splits in this notebook are taken by list position, and the CC and MLO lists
# are paired index by index, so every listing is sorted to make the result
# deterministic across platforms and re-runs.

# Cancer
# Get a sorted list of all the images inside the folder
cancer_cc = sorted(os.listdir(cancer_path_cc))
cancer_mlo = sorted(os.listdir(cancer_path_mlo))

# No cancer
# Get a sorted list of all the images inside the folder
no_cancer_cc = sorted(os.listdir(no_cancer_path_cc))
no_cancer_mlo = sorted(os.listdir(no_cancer_path_mlo))

In [6]:
# Report how many files each listing holds, one class at a time.

# Cancer
for caption, listing in (
    ("Cancer cc view:", cancer_cc),
    ("Cancer mlo view:", cancer_mlo),
):
    print(caption, len(listing))

# Blank lines between the two classes
print("\n")

# No cancer
for caption, listing in (
    ("No cancer cc view:", no_cancer_cc),
    ("No cancer mlo view:", no_cancer_mlo),
):
    print(caption, len(listing))

Cancer cc view: 1036
Cancer mlo view: 1036


No cancer cc view: 971
No cancer mlo view: 971


In [7]:
# Spot-check a few file names at the same positions in every listing.
sample_positions = (20, 230, 510)

# Cancer
print("Cancer cc view:", *(cancer_cc[pos] for pos in sample_positions))
print("Cancer mlo view:", *(cancer_mlo[pos] for pos in sample_positions))

# Blank lines between the two classes
print("\n")

# No cancer
print("No cancer cc view:", *(no_cancer_cc[pos] for pos in sample_positions))
print("No cancer mlo view:", *(no_cancer_mlo[pos] for pos in sample_positions))

Cancer cc view: 10940_420829283.png 2346_129609192.png 38946_1288050525.png
Cancer mlo view: 10940_888426604.png 2346_754579791.png 38946_808034784.png


No cancer cc view: 10847_1601893608.png 2552_189350088.png 42508_430732548.png
No cancer mlo view: 10847_1589228359.png 2552_698117587.png 42508_1992043381.png


## Split Ratio
  - Training:	75%
  - Validation: 15%
  - Testing:	10%

In [8]:
# Share of each class assigned to training, validation and testing (in that order)
train_ratio, validate_ratio, test_ratio = 0.75, 0.15, 0.10

## Calculate Split Indices

### Cancer

In [9]:
# Number of entries in the cancer CC listing. Despite its name this is a
# count, not a ratio; the split ratios are multiplied by it further down.
ratio_c =  len(cancer_cc)

In [10]:
# Training and validation sizes: the class total times the ratio, truncated to an int
train_count_c, validate_count_c = (
    int(ratio_c * train_ratio),
    int(ratio_c * validate_ratio),
)

# Testing takes the remainder, so the three counts always add up to the class total
test_count_c = ratio_c - (train_count_c + validate_count_c)

# Print results
for caption, count in (
    ("Training - cancer: ", train_count_c),
    ("Validation - cancer: ", validate_count_c),
    ("Testing - cancer: ", test_count_c),
    ("Total - cancer: ", train_count_c + validate_count_c + test_count_c),
):
    print(caption, count)

Training - cancer:  777
Validation - cancer:  155
Testing - cancer:  104
Total - cancer:  1036


### No Cancer

In [11]:
# Number of entries in the no-cancer CC listing. Despite its name this is a
# count, not a ratio; the split ratios are multiplied by it further down.
ratio_nc =  len(no_cancer_cc)

In [12]:
# Training and validation sizes: the class total times the ratio, truncated to an int
train_count_nc, validate_count_nc = (
    int(ratio_nc * train_ratio),
    int(ratio_nc * validate_ratio),
)

# Testing takes the remainder, so the three counts always add up to the class total
test_count_nc = ratio_nc - (train_count_nc + validate_count_nc)

# Print results
for caption, count in (
    ("Training - no cancer: ", train_count_nc),
    ("Validation - no cancer: ", validate_count_nc),
    ("Testing - no cancer: ", test_count_nc),
    ("Total - no cancer: ", train_count_nc + validate_count_nc + test_count_nc),
):
    print(caption, count)

Training - no cancer:  728
Validation - no cancer:  145
Testing - no cancer:  98
Total - no cancer:  971


## Lists According to Split Ratio

### Cancer

In [13]:
# CC view
# Cut the cancer CC listing into three consecutive, non-overlapping slices:
# train first, then validate, then everything that is left for test.
(
    train_images_cancer_cc_split,
    validate_images_cancer_cc_split,
    test_images_cancer_cc_split,
) = (
    cancer_cc[:train_count_c],
    cancer_cc[train_count_c:train_count_c + validate_count_c],
    cancer_cc[train_count_c + validate_count_c:],
)

# Show the size of each slice
for caption, subset in (
    ("Cancer train cc view:", train_images_cancer_cc_split),
    ("Cancer validate cc view:", validate_images_cancer_cc_split),
    ("Cancer test cc view:", test_images_cancer_cc_split),
):
    print(caption, len(subset))

Cancer train cc view: 777
Cancer validate cc view: 155
Cancer test cc view: 104


In [14]:
# MLO view
# Cut the cancer MLO listing into three consecutive, non-overlapping slices:
# train first, then validate, then everything that is left for test.
(
    train_images_cancer_mlo_split,
    validate_images_cancer_mlo_split,
    test_images_cancer_mlo_split,
) = (
    cancer_mlo[:train_count_c],
    cancer_mlo[train_count_c:train_count_c + validate_count_c],
    cancer_mlo[train_count_c + validate_count_c:],
)

# Show the size of each slice
for caption, subset in (
    ("Cancer train mlo view:", train_images_cancer_mlo_split),
    ("Cancer validate mlo view:", validate_images_cancer_mlo_split),
    ("Cancer test mlo view:", test_images_cancer_mlo_split),
):
    print(caption, len(subset))

Cancer train mlo view: 777
Cancer validate mlo view: 155
Cancer test mlo view: 104


### No Cancer

In [15]:
# CC view
# Cut the no-cancer CC listing into three consecutive, non-overlapping slices:
# train first, then validate, then everything that is left for test.
(
    train_images_no_cancer_cc_split,
    validate_images_no_cancer_cc_split,
    test_images_no_cancer_cc_split,
) = (
    no_cancer_cc[:train_count_nc],
    no_cancer_cc[train_count_nc:train_count_nc + validate_count_nc],
    no_cancer_cc[train_count_nc + validate_count_nc:],
)

# Show the size of each slice
for caption, subset in (
    ("No cancer train cc view:", train_images_no_cancer_cc_split),
    ("No cancer validate cc view:", validate_images_no_cancer_cc_split),
    ("No cancer test cc view:", test_images_no_cancer_cc_split),
):
    print(caption, len(subset))

No cancer train cc view: 728
No cancer validate cc view: 145
No cancer test cc view: 98


In [16]:
# MLO view
# Cut the no-cancer MLO listing into three consecutive, non-overlapping slices:
# train first, then validate, then everything that is left for test.
(
    train_images_no_cancer_mlo_split,
    validate_images_no_cancer_mlo_split,
    test_images_no_cancer_mlo_split,
) = (
    no_cancer_mlo[:train_count_nc],
    no_cancer_mlo[train_count_nc:train_count_nc + validate_count_nc],
    no_cancer_mlo[train_count_nc + validate_count_nc:],
)

# Show the size of each slice
for caption, subset in (
    ("No cancer train mlo view:", train_images_no_cancer_mlo_split),
    ("No cancer validate mlo view:", validate_images_no_cancer_mlo_split),
    ("No cancer test mlo view:", test_images_no_cancer_mlo_split),
):
    print(caption, len(subset))

No cancer train mlo view: 728
No cancer validate mlo view: 145
No cancer test mlo view: 98


## Printouts to Check Correlation 

#### Note: The part of each file name before the first “_” is the patient id. It must match between the CC and MLO images at the same list position, because the two views are split by index.

### Cancer

In [17]:
# Print the file names at the same positions of the CC and MLO slices so the
# patient ids can be compared by eye. Each row is labelled with the split it
# is sampled from (train / validate / test).

# Train
print("\n" + "Train:")
# CC view
print("Cancer train cc view:", train_images_cancer_cc_split[4], train_images_cancer_cc_split[76], train_images_cancer_cc_split[143])
# MLO view
print("Cancer train mlo view:", train_images_cancer_mlo_split[4], train_images_cancer_mlo_split[76], train_images_cancer_mlo_split[143])


# Validate
print("\n" + "Validate:")
# CC view
print("Cancer validate cc view:", validate_images_cancer_cc_split[3], validate_images_cancer_cc_split[49], validate_images_cancer_cc_split[76])
# MLO view
print("Cancer validate mlo view:", validate_images_cancer_mlo_split[3], validate_images_cancer_mlo_split[49], validate_images_cancer_mlo_split[76])


# Test
print("\n" + "Test:")
# CC view
print("Cancer test cc view:", test_images_cancer_cc_split[2], test_images_cancer_cc_split[34], test_images_cancer_cc_split[52])
# MLO view
print("Cancer test mlo view:", test_images_cancer_mlo_split[2], test_images_cancer_mlo_split[34], test_images_cancer_mlo_split[52])


Train:
Cancer train cc view: 10226_530620473.png 13756_1210098281.png 17891_1605416080_flip.png
Cancer train mlo view: 10226_461614796.png 13756_1928377168.png 17891_1991006826_flip.png

Validate:
Cancer train cc view: 55148_1055402003.png 57754_186045163.png 59865_1213747044_flip.png
Cancer train mlo view: 55148_926538150.png 57754_690582134.png 59865_835658415_flip.png

Test:
Cancer train cc view: 64534_1420658905.png 6654_1497844117.png 7196_976434791.png
Cancer train mlo view: 64534_1551728826.png 6654_2126262062.png 7196_1650195108.png


### No Cancer

In [18]:
# Print the file names at the same positions of the CC and MLO slices so the
# patient ids can be compared by eye. Each row is labelled with the split it
# is sampled from (train / validate / test).

# Train
print("\n" + "Train:")
# CC view
print("No cancer train cc view:", train_images_no_cancer_cc_split[4], train_images_no_cancer_cc_split[76], train_images_no_cancer_cc_split[143])
# MLO view
print("No cancer train mlo view:", train_images_no_cancer_mlo_split[4], train_images_no_cancer_mlo_split[76], train_images_no_cancer_mlo_split[143])


# Validate
print("\n" + "Validate:")
# CC view
print("No cancer validate cc view:", validate_images_no_cancer_cc_split[3], validate_images_no_cancer_cc_split[49], validate_images_no_cancer_cc_split[76])
# MLO view
print("No cancer validate mlo view:", validate_images_no_cancer_mlo_split[3], validate_images_no_cancer_mlo_split[49], validate_images_no_cancer_mlo_split[76])


# Test
print("\n" + "Test:")
# CC view
print("No cancer test cc view:", test_images_no_cancer_cc_split[2], test_images_no_cancer_cc_split[34], test_images_no_cancer_cc_split[52])
# MLO view
print("No cancer test mlo view:", test_images_no_cancer_mlo_split[2], test_images_no_cancer_mlo_split[34], test_images_no_cancer_mlo_split[52])


Train:
No cancer train cc view: 10200_534538517.png 13623_1502862823.png 17836_2028731091_flip.png
No cancer train mlo view: 10200_570041037.png 13623_2133628885.png 17836_1983324546_flip.png

Validate:
No cancer train cc view: 54271_559744954_flip.png 56919_1034774361_flip.png 58058_332734942.png
No cancer train mlo view: 54271_1983613191_flip.png 56919_2125303966_flip.png 58058_228471823.png

Test:
No cancer train cc view: 64016_1917790673_flip.png 65431_358163242_flip.png 7291_1200655017_flip.png
No cancer train mlo view: 64016_697957354_flip.png 65431_439189892_flip.png 7291_1145323817_flip.png


## Copy PNG Images to Folders

In [19]:
# Function to copy files to the respective folders
def move_files(origin_folder, img_list, destination_folder):
    """Copy every image named in ``img_list`` from one folder to another.

    Despite the name, files are copied with ``shutil.copy``; the originals
    stay in ``origin_folder``.

    Parameters
    ----------
    origin_folder : str
        Folder that currently holds the images. A trailing path separator is
        accepted but not required.
    img_list : iterable of str
        File names (not full paths) of the images to copy.
    destination_folder : str
        Folder that receives the copies. It is created, together with any
        missing parent folders, when it does not exist yet.

    Returns
    -------
    None
    """
    # shutil.copy does not create missing directories, so make sure the target
    # exists first. An empty string means "current directory" and needs no
    # creation (os.makedirs("") would raise).
    if destination_folder:
        os.makedirs(destination_folder, exist_ok=True)

    for png in img_list:
        # Define the source png image path; os.path.join inserts a separator
        # only when the folder does not already end with one
        source_img_path = os.path.join(origin_folder, png)

        # Define the destination png image path
        destination_img_path = os.path.join(destination_folder, png)

        # Copy the file to the determined destination
        shutil.copy(source_img_path, destination_img_path)

### Cancer

In [20]:
# Copy the cancer images into their split folders.
# Each row is (source folder, file names, destination folder); rows run in
# the order CC train / validate / test, then MLO train / validate / test.
for source_dir, file_names, target_dir in (
    # CC view
    (cancer_path_cc, train_images_cancer_cc_split, train_cancer_cc),
    (cancer_path_cc, validate_images_cancer_cc_split, validate_cancer_cc),
    (cancer_path_cc, test_images_cancer_cc_split, test_cancer_cc),
    # MLO view
    (cancer_path_mlo, train_images_cancer_mlo_split, train_cancer_mlo),
    (cancer_path_mlo, validate_images_cancer_mlo_split, validate_cancer_mlo),
    (cancer_path_mlo, test_images_cancer_mlo_split, test_cancer_mlo),
):
    move_files(source_dir, file_names, target_dir)

### No Cancer

In [21]:
# Copy the no-cancer images into their split folders.
# Each row is (source folder, file names, destination folder); rows run in
# the order CC train / validate / test, then MLO train / validate / test.
for source_dir, file_names, target_dir in (
    # CC view
    (no_cancer_path_cc, train_images_no_cancer_cc_split, train_no_cancer_cc),
    (no_cancer_path_cc, validate_images_no_cancer_cc_split, validate_no_cancer_cc),
    (no_cancer_path_cc, test_images_no_cancer_cc_split, test_no_cancer_cc),
    # MLO view
    (no_cancer_path_mlo, train_images_no_cancer_mlo_split, train_no_cancer_mlo),
    (no_cancer_path_mlo, validate_images_no_cancer_mlo_split, validate_no_cancer_mlo),
    (no_cancer_path_mlo, test_images_no_cancer_mlo_split, test_no_cancer_mlo),
):
    move_files(source_dir, file_names, target_dir)

## Test the Path Totals

### Train

In [22]:
# Cancer
# Read back the contents of the training folders (CC view, then MLO view)
total_train_cancer_cc, total_train_cancer_mlo = (
    os.listdir(train_cancer_cc),
    os.listdir(train_cancer_mlo),
)

# Print the total amount of images for training - cancer (both views together)
print(sum(map(len, (total_train_cancer_cc, total_train_cancer_mlo))))

1554


In [23]:
# No cancer
# Read back the contents of the training folders (CC view, then MLO view)
total_train_no_cancer_cc, total_train_no_cancer_mlo = (
    os.listdir(train_no_cancer_cc),
    os.listdir(train_no_cancer_mlo),
)

# Print the total amount of images for training - no cancer (both views together)
print(sum(map(len, (total_train_no_cancer_cc, total_train_no_cancer_mlo))))

1456


### Validate

In [24]:
# Cancer
# Read back the contents of the validation folders (CC view, then MLO view)
total_validate_cancer_cc, total_validate_cancer_mlo = (
    os.listdir(validate_cancer_cc),
    os.listdir(validate_cancer_mlo),
)

# Print the total amount of images for validation - cancer (both views together)
print(sum(map(len, (total_validate_cancer_cc, total_validate_cancer_mlo))))

310


In [25]:
# No cancer
# Read back the contents of the validation folders (CC view, then MLO view)
total_validate_no_cancer_cc, total_validate_no_cancer_mlo = (
    os.listdir(validate_no_cancer_cc),
    os.listdir(validate_no_cancer_mlo),
)

# Print the total amount of images for validation - no cancer (both views together)
print(sum(map(len, (total_validate_no_cancer_cc, total_validate_no_cancer_mlo))))

290


### Test

In [26]:
# Cancer
# Read back the contents of the testing folders (CC view, then MLO view)
total_test_cancer_cc, total_test_cancer_mlo = (
    os.listdir(test_cancer_cc),
    os.listdir(test_cancer_mlo),
)

# Print the total amount of images for testing - cancer (both views together)
print(sum(map(len, (total_test_cancer_cc, total_test_cancer_mlo))))

208


In [27]:
# No cancer
# Read back the contents of the testing folders (CC view, then MLO view)
total_test_no_cancer_cc, total_test_no_cancer_mlo = (
    os.listdir(test_no_cancer_cc),
    os.listdir(test_no_cancer_mlo),
)

# Print the total amount of images for testing - no cancer (both views together)
print(sum(map(len, (total_test_no_cancer_cc, total_test_no_cancer_mlo))))

196


## Total Images for CNN

In [28]:
# Images per split: both classes and both views added together
training = sum(
    len(listing)
    for listing in (
        total_train_cancer_cc,
        total_train_cancer_mlo,
        total_train_no_cancer_cc,
        total_train_no_cancer_mlo,
    )
)
validation = sum(
    len(listing)
    for listing in (
        total_validate_cancer_cc,
        total_validate_cancer_mlo,
        total_validate_no_cancer_cc,
        total_validate_no_cancer_mlo,
    )
)
testing = sum(
    len(listing)
    for listing in (
        total_test_cancer_cc,
        total_test_cancer_mlo,
        total_test_no_cancer_cc,
        total_test_no_cancer_mlo,
    )
)

In [29]:
# Grand total over the three splits
print("Total image set: ", sum((training, validation, testing)))

Total image set:  4014


## Ensure No Errors

In [30]:
# One flat list of file names per split, in the order:
# cancer CC, cancer MLO, no-cancer CC, no-cancer MLO.

# Training
combo_train = [
    *total_train_cancer_cc,
    *total_train_cancer_mlo,
    *total_train_no_cancer_cc,
    *total_train_no_cancer_mlo,
]

# Validation
combo_validate = [
    *total_validate_cancer_cc,
    *total_validate_cancer_mlo,
    *total_validate_no_cancer_cc,
    *total_validate_no_cancer_mlo,
]

# Testing
combo_test = [
    *total_test_cancer_cc,
    *total_test_cancer_mlo,
    *total_test_no_cancer_cc,
    *total_test_no_cancer_mlo,
]

In [31]:
# Every file name from the three splits in a single list
combined_list = [*combo_train, *combo_validate, *combo_test]

# The same names with repeats collapsed
combined_set = set(combined_list)

# A set that is smaller than the list means at least one name occurs more than once
if len(combined_set) != len(combined_list):
    print("Duplicates found - some items appear in multiple lists.")
else:
    print("No duplicates found - each item is only in one list.")

No duplicates found - each item is only in one list.
