# The DICOM to DICOM No Subfolder Class Notebook

##### Reads the newly created cancer and no cancer CSV files.
##### Compiles lists containing the relevant patient_image_id, patient_id and image_id according to class (cancer or no cancer).
##### Copies the selected DICOM files to the relevant class folders (the originals stay in place).

## Import the Relevant Libraries

In [1]:
# Allows for the access to the image directories
import os

In [2]:
# Allows for the copying of images to new folders
import shutil

In [3]:
# Allows for dataframe creation as well as manipulation and cleaning of the data
import pandas as pd

## Import and Read the CSV File - Cancer 

In [4]:
# Load the metadata of the training images - cancer class
# NOTE: the preview below shows an "Unnamed: 0" column, presumably an index saved with the CSV - confirm
cancer_meta = pd.read_csv("new_sorted_cancer_df.csv")

In [5]:
# Preview the first 2 rows to check that the columns and values loaded as expected
cancer_meta.head(2)

Unnamed: 0,patient_image_id,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,10130_1360338805,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False
1,10130_1672636630,1,10130,1672636630,L,MLO,71.0,1,1,1,0.0,0,B,49,False


In [6]:
# Display the number of rows in the cancer metadata
print(len(cancer_meta))

1158


## Import and Read the CSV File - No Cancer 

In [7]:
# Load the metadata of the training images - no cancer class
no_cancer_meta = pd.read_csv("new_sorted_non_cancer_df.csv")

In [8]:
# Display the last 2 rows of the metadata
no_cancer_meta.tail(2)

Unnamed: 0,patient_image_id,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
2263,9549_760279512,1,9549,760279512,L,MLO,48.0,0,0,0,2.0,0,A,49,False
2264,9549_856218783,1,9549,856218783,R,CC,48.0,0,0,0,2.0,0,A,49,False


In [9]:
# Display the number of rows in the no cancer metadata
print(len(no_cancer_meta))

2265


In [10]:
# Number of metadata rows in each class
cancer, no_cancer = len(cancer_meta), len(no_cancer_meta)

# How much smaller the cancer class is, as a percentage of the no cancer class
img_qty_difference = round(100 * (no_cancer - cancer) / no_cancer, 2)

print(f"The image quantity difference between the two classes is  {img_qty_difference} %")

The image quantity difference between the two classes is  48.87 %


## Compile Cancer and No Cancer ID Lists

In [11]:
# Pull the three identifier columns out of the cancer metadata as plain Python lists
cancer_id, cancer_pt_id, cancer_img_id = (
    cancer_meta[column].tolist()
    for column in ("patient_image_id", "patient_id", "image_id")
)

In [12]:
# Pull the three identifier columns out of the no cancer metadata as plain Python lists
no_cancer_id, no_cancer_pt_id, no_cancer_img_id = (
    no_cancer_meta[column].tolist()
    for column in ("patient_image_id", "patient_id", "image_id")
)

In [13]:
# Sanity check: the id list and the metadata should have the same size - cancer
print(len(cancer_id), len(cancer_meta), sep="\n")

1158
1158


In [14]:
# Sanity check: the id list and the metadata should have the same size - no cancer
print(len(no_cancer_id), len(no_cancer_meta), sep="\n")

2265
2265


In [15]:
# Demonstrate the "<patient_id>_<image_id>" string format of the entries - cancer list
cancer_id[0:10]

['10130_1360338805',
 '10130_1672636630',
 '10130_388811999',
 '10130_613462606',
 '10226_461614796',
 '10226_530620473',
 '1025_1803952236',
 '1025_773597682',
 '10432_1434858530',
 '10432_458553546']

In [16]:
# Demonstrate the "<patient_id>_<image_id>" string format of the entries - no cancer list
no_cancer_id[20:30]

['10399_1470470799',
 '10399_52510266',
 '10399_863598426',
 '10526_1505027709',
 '10526_1782974695',
 '10526_376934385',
 '10526_657541655',
 '10526_880953279',
 '10528_1472804609',
 '10528_2124356768']

## Copy DICOM Images to Class Folders

In [18]:
# Define the source directory, which holds one sub-folder of DICOM files per patient_id
# (absolute path to a local drive; adjust it when running on another machine)
data_dir = "F:/RSNA_Mammograms/train_images/"

In [19]:
# Define the destination directories, one per class (relative to the working directory)
# The trailing slash is part of each value, so a file name can be appended directly
data_dir_cancer = "RSNA_Mammograms_Class/cancer/"
data_dir_no_cancer = "RSNA_Mammograms_Class/no_cancer/"

### The class_shutil function 
 - Takes four inputs:
    * id = patient_image_id
    * pt_id = patient_id
    * img_id = image_id
    * out_path
 - The three id inputs are the lists that were compiled from the CSV files for the class being copied.
 - out_path represents the destination directory for that class (cancer or no cancer).
 - Thus, out_path defines where the DICOM images will be copied to.
 - For each image, the destination path is set by combining out_path with the patient_image_id, and the file is then copied to it.

In [20]:
def class_shutil(id, pt_id, img_id, out_path):
    """Copy each listed DICOM file from the source tree into one class folder.

    The source file for position ``i`` is ``<data_dir>/<pt_id[i]>/<img_id[i]>.dcm``
    and it is copied to ``<out_path>/<id[i]>.dcm``, where ``data_dir`` is the
    notebook-level source directory.

    Parameters
    ----------
    id : sequence of str
        patient_image_id values; each one names the copied file.
    pt_id : sequence
        patient_id values; each one names the source sub-folder.
    img_id : sequence
        image_id values; each one names the source file.
    out_path : str
        Destination folder for the class. It is created if it does not exist.

    Raises
    ------
    ValueError
        If the three id sequences do not have the same length.
    FileNotFoundError
        If a source DICOM file does not exist (raised by ``shutil.copy``).
    """
    # Materialise the inputs so their lengths can be compared before any copying
    id, pt_id, img_id = list(id), list(pt_id), list(img_id)

    # The lists are parallel; a length mismatch would pair files with the wrong names
    if not (len(id) == len(pt_id) == len(img_id)):
        raise ValueError(
            "id, pt_id and img_id must have the same length, got "
            + str(len(id)) + ", " + str(len(pt_id)) + " and " + str(len(img_id))
        )

    # shutil.copy does not create missing folders, so make sure the destination exists
    if out_path:
        os.makedirs(out_path, exist_ok=True)

    for name, patient, image in zip(id, pt_id, img_id):
        # Define the source DICOM image path
        source_img_path = os.path.join(data_dir, str(patient), str(image) + ".dcm")

        # Define the destination DICOM image path
        destination_img_path = os.path.join(out_path, str(name) + ".dcm")

        # Copy the file to the determined destination
        shutil.copy(source_img_path, destination_img_path)

In [21]:
# Copy the cancer images into the cancer class folder, each named by its patient_image_id
class_shutil(cancer_id, cancer_pt_id, cancer_img_id, data_dir_cancer)

In [22]:
# Copy the no cancer images into the no cancer class folder, each named by its patient_image_id
class_shutil(no_cancer_id, no_cancer_pt_id, no_cancer_img_id, data_dir_no_cancer)

## Check Folders

In [23]:
# List every entry now present in the cancer destination folder
cancer_dir = os.listdir(data_dir_cancer)

# The number of entries should match the number of metadata rows - cancer
print(len(cancer_dir), len(cancer_meta), sep="\n")

1158
1158


In [24]:
# List every entry now present in the no cancer destination folder
no_cancer_dir = os.listdir(data_dir_no_cancer)

# The number of entries should match the number of metadata rows - no cancer
print(len(no_cancer_dir), len(no_cancer_meta), sep="\n")

2265
2265
