In [1]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import cv2
import numpy as np 	
import os
import xml.etree.ElementTree as et

import matplotlib.pyplot as plt
%matplotlib inline


# Root of the raw data set, and the folder that receives the processed output
data_path = os.path.join(".", "data")
data_processed_path = os.path.join(data_path, "pre-processed")

# Raw benign and malignant case folders sit directly under the data root
benign_path, malignant_path = (
    os.path.join(data_path, label) for label in ("benign", "malignant")
)

# Processed benign and malignant images go to matching sub-folders of the output root
processed_benign_path, processed_malignant_path = (
    os.path.join(data_processed_path, label) for label in ("benign", "malignant")
)

In [2]:
# Process benign images
#
# Every .jpg in each patient folder is cropped to the region described in the
# folder's XML annotation, resized to 300x250 and written to the processed
# benign folder. When the annotation holds no points, the image is cropped to
# the bounding box of its largest thresholded contour instead.

# cv2.imwrite does not create missing folders, so make sure the output folder
# exists before any image is written
if not os.path.isdir(processed_benign_path):
    os.makedirs(processed_benign_path)

# Declare image_number; to use later when each folder has multiple images
image_n = 0

# Loop over all patient folders in benign cases
for img_folder in sorted(os.listdir(benign_path)):

    # Store path to current patient folder
    path = os.path.join(benign_path, img_folder)

    # Ignore entries that aren't patient folders (3-character directory names)
    if len(img_folder) != 3 or not os.path.isdir(path):
        continue

    # Store xml filename for current patient folder
    xml_file = os.path.join(path, img_folder+'.xml')

    # Root of the annotation tree; parsed once per patient folder, on first use,
    # instead of once per image
    e = None

    # Loop over all .jpg files in the current patient folder
    for img in sorted(os.listdir(path)):

        # Ignore files that aren't .jpg files (images)
        if img[-4:] != '.jpg':
            continue

        # Store path for current image
        path_img = os.path.join(path, img)

        # Read image; cv2.imread returns None (it does not raise) on failure
        image = cv2.imread(path_img)
        if image is None:
            print('Skipping unreadable image: ' + path_img)
            continue

        # Convert image to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Store image number for current patient folder (first character of the
        # file name); used to pick the matching entry from the annotation file
        image_n = int(img[0])

        # Parse the xml file to get data points of marked region
        if e is None:
            e = et.parse(xml_file).getroot()

        # Image number announced by the most recent <image> element
        image_tag = 0

        # Parse for each child in xml tree
        for child in e:

            # Parse for each subchild (image number, and svg x,y pairs) in xml tree:
            for subchild in child:

                # Check and store image number to match with image to be processed
                if subchild.tag == 'image':
                    image_tag = int(subchild.text)
                    continue

                # Only svg entries that belong to the current image are of interest
                if subchild.tag != 'svg' or image_tag != image_n:
                    continue

                points = subchild.text

                # If no x,y co-ordinate pairs found in the annotation file
                # Threshold the image and crop the image with the co-ordinates of the
                # object with maximum area (To remove other image details not required)
                if points is None:
                    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY)[1]

                    # findContours returns (contours, hierarchy) or
                    # (image, contours, hierarchy) depending on the OpenCV version;
                    # the contours are the second-to-last item in both layouts
                    contours = cv2.findContours(thresh, cv2.RETR_TREE,
                                                cv2.CHAIN_APPROX_SIMPLE)[-2]
                    if len(contours) == 0:
                        print('No contour found in ' + path_img + '; skipping')
                        continue

                    areas = [cv2.contourArea(c) for c in contours]
                    cnt = contours[np.argmax(areas)]
                    x, y, w, h = cv2.boundingRect(cnt)

                    # Fixed margins trim the border of the bounding box
                    crop_img = image[y+35:y+h-5, x+25:x+w-10]
                    if crop_img.size == 0:
                        print('Empty crop for ' + path_img + '; skipping')
                        continue

                    resize_img = cv2.resize(crop_img, (300, 250), interpolation=cv2.INTER_CUBIC)

                    # Benign images stay in the benign output folder
                    img_write_path = os.path.join(processed_benign_path, img_folder+'_'+img)
                    cv2.imwrite(img_write_path, resize_img)
                    continue

                # Scan the annotation text character by character: each value is
                # read starting 4 characters after its 'x' / 'y' key and is at
                # most 3 digits long.
                # NOTE(review): the last 47 characters are never scanned, presumably
                # because they hold non-coordinate data -- confirm against the
                # annotation format.
                x_pts = []
                y_pts = []
                for i in range(0, len(points)-47):
                    if points[i] != 'x' and points[i] != 'y':
                        continue

                    pt = points[i+4]
                    for j in range(1, 3):
                        if not points[i+4+j].isdigit():
                            break
                        pt = pt + points[i+4+j]

                    if points[i] == 'x':
                        x_pts.append(int(pt))
                    else:
                        y_pts.append(int(pt))

                # Nothing to crop to when the text held no co-ordinates
                if not x_pts or not y_pts:
                    print('No co-ordinates found for ' + path_img + '; skipping annotation')
                    continue

                # Calculate the minimum and maximum values of x-points and y-points
                # to find co-ordinates of cropped image
                x_min, x_max = min(x_pts), max(x_pts)
                y_min, y_max = min(y_pts), max(y_pts)

                # Crop image
                crop_img = image[y_min:y_max, x_min:x_max]
                if crop_img.size == 0:
                    print('Empty crop for ' + path_img + '; skipping annotation')
                    continue

                # Resize image
                resize_img = cv2.resize(crop_img, (300, 250), interpolation=cv2.INTER_CUBIC)

                # Declare path for storing processed image
                img_write_path = os.path.join(processed_benign_path, img_folder+'_'+img)

                # Write and store processed image
                cv2.imwrite(img_write_path, resize_img)

                # This entry is done; move on to the next child of the xml tree
                break

In [3]:
# Process malignant images
#
# Every .jpg in each patient folder is cropped to the region described in the
# folder's XML annotation, resized to 300x250 and written to the processed
# malignant folder. When the annotation holds no points, the image is cropped
# to the bounding box of its largest thresholded contour instead.

# cv2.imwrite does not create missing folders, so make sure the output folder
# exists before any image is written
if not os.path.isdir(processed_malignant_path):
    os.makedirs(processed_malignant_path)

# Declare image_number; to use later when each folder has multiple images
image_n = 0

# Loop over all patient folders in malignant cases
for img_folder in sorted(os.listdir(malignant_path)):

    # Store path to current patient folder
    path = os.path.join(malignant_path, img_folder)

    # Ignore entries that aren't patient folders (3-character directory names)
    if len(img_folder) != 3 or not os.path.isdir(path):
        continue

    # Store xml filename for current patient folder
    xml_file = os.path.join(path, img_folder+'.xml')

    # Root of the annotation tree; parsed once per patient folder, on first use,
    # instead of once per image
    e = None

    # Loop over all .jpg files in the current patient folder
    for img in sorted(os.listdir(path)):

        # Ignore files that aren't .jpg files (images)
        if img[-4:] != '.jpg':
            continue

        # Store path for current image
        path_img = os.path.join(path, img)

        # Read image; cv2.imread returns None (it does not raise) on failure
        image = cv2.imread(path_img)
        if image is None:
            print('Skipping unreadable image: ' + path_img)
            continue

        # Convert image to grayscale
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Store image number for current patient folder (first character of the
        # file name); used to pick the matching entry from the annotation file
        image_n = int(img[0])

        # Parse the xml file to get data points of marked region
        if e is None:
            e = et.parse(xml_file).getroot()

        # Image number announced by the most recent <image> element
        image_tag = 0

        # Parse for each child in xml tree
        for child in e:

            # Parse for each subchild (image number, and svg x,y pairs) in xml tree:
            for subchild in child:

                # Check and store image number to match with image to be processed
                if subchild.tag == 'image':
                    image_tag = int(subchild.text)
                    continue

                # Only svg entries that belong to the current image are of interest
                if subchild.tag != 'svg' or image_tag != image_n:
                    continue

                points = subchild.text

                # If no x,y co-ordinate pairs found in the annotation file
                # Threshold the image and crop the image with the co-ordinates of the
                # object with maximum area (To remove other image details not required)
                if points is None:
                    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY)[1]

                    # findContours returns (contours, hierarchy) or
                    # (image, contours, hierarchy) depending on the OpenCV version;
                    # the contours are the second-to-last item in both layouts
                    contours = cv2.findContours(thresh, cv2.RETR_TREE,
                                                cv2.CHAIN_APPROX_SIMPLE)[-2]
                    if len(contours) == 0:
                        print('No contour found in ' + path_img + '; skipping')
                        continue

                    areas = [cv2.contourArea(c) for c in contours]
                    cnt = contours[np.argmax(areas)]
                    x, y, w, h = cv2.boundingRect(cnt)

                    # Fixed margins trim the border of the bounding box
                    crop_img = image[y+35:y+h-5, x+25:x+w-10]
                    if crop_img.size == 0:
                        print('Empty crop for ' + path_img + '; skipping')
                        continue

                    resize_img = cv2.resize(crop_img, (300, 250), interpolation=cv2.INTER_CUBIC)
                    img_write_path = os.path.join(processed_malignant_path, img_folder+'_'+img)
                    cv2.imwrite(img_write_path, resize_img)
                    continue

                # Scan the annotation text character by character: each value is
                # read starting 4 characters after its 'x' / 'y' key and is at
                # most 3 digits long.
                # NOTE(review): the last 47 characters are never scanned, presumably
                # because they hold non-coordinate data -- confirm against the
                # annotation format.
                x_pts = []
                y_pts = []
                for i in range(0, len(points)-47):
                    if points[i] != 'x' and points[i] != 'y':
                        continue

                    pt = points[i+4]
                    for j in range(1, 3):
                        if not points[i+4+j].isdigit():
                            break
                        pt = pt + points[i+4+j]

                    if points[i] == 'x':
                        x_pts.append(int(pt))
                    else:
                        y_pts.append(int(pt))

                # Nothing to crop to when the text held no co-ordinates
                if not x_pts or not y_pts:
                    print('No co-ordinates found for ' + path_img + '; skipping annotation')
                    continue

                # Calculate the minimum and maximum values of x-points and y-points
                # to find co-ordinates of cropped image
                x_min, x_max = min(x_pts), max(x_pts)
                y_min, y_max = min(y_pts), max(y_pts)

                # Crop image
                crop_img = image[y_min:y_max, x_min:x_max]
                if crop_img.size == 0:
                    print('Empty crop for ' + path_img + '; skipping annotation')
                    continue

                # Resize image
                resize_img = cv2.resize(crop_img, (300, 250), interpolation=cv2.INTER_CUBIC)

                # Declare path for storing processed image
                img_write_path = os.path.join(processed_malignant_path, img_folder+'_'+img)

                # Write and store processed image
                cv2.imwrite(img_write_path, resize_img)

                # This entry is done; move on to the next child of the xml tree
                break

___