# Automate X-Ray Preprocessing Pipeline
In this notebook, we automate the x-ray preprocessing pipeline: given the path to a directory containing x-ray images, it preprocesses every image in that directory and saves the processed versions to an output folder.

The investigation behind the pipeline and its design are detailed in [Left Wrist X-Ray Pre-Processing Investigation](./Left%20Wrist%20X-Ray%20Image%20Pre-Processing%20Investigation.ipynb).

In [1]:
# import libraries
import time

import tensorflow
import keras
import os
import glob
from skimage import io, transform, exposure
import skimage
import random
import numpy as np
import matplotlib.pyplot as plt
import warnings
%matplotlib inline

from numpy import expand_dims
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import ImageDataGenerator

import cv2

import warnings

warnings.filterwarnings('ignore', 'FutureWarning')

Using TensorFlow backend.


In [10]:
# folder holding the x-rays to preprocess
# (keep the trailing slash: file paths are built by plain concatenation)
dataset_path = 'data/validation/'

# collect the path of every regular file directly inside the dataset folder;
# sub-folders / directories are skipped
file_paths = [
    dataset_path + entry
    for entry in os.listdir(dataset_path)
    if os.path.isfile(os.path.join(dataset_path, entry))
]

file_paths

['data/validation/10686.png',
 'data/validation/11573.png',
 'data/validation/12708.png',
 'data/validation/9727.png',
 'data/validation/11759.png',
 'data/validation/14609.png',
 'data/validation/11017.png',
 'data/validation/9848.png',
 'data/validation/11188.png',
 'data/validation/10241.png',
 'data/validation/10255.png',
 'data/validation/13774.png',
 'data/validation/15305.png',
 'data/validation/10269.png',
 'data/validation/12318.png',
 'data/validation/15477.png',
 'data/validation/14580.png',
 'data/validation/12132.png',
 'data/validation/13238.png',
 'data/validation/14219.png',
 'data/validation/13576.png',
 'data/validation/13210.png',
 'data/validation/13577.png',
 'data/validation/14218.png',
 'data/validation/12127.png',
 'data/validation/14230.png',
 'data/validation/12872.png',
 'data/validation/10268.png',
 'data/validation/14740.png',
 'data/validation/13946.png',
 'data/validation/15489.png',
 'data/validation/10283.png',
 'data/validation/12494.png',
 'data/valid

In [3]:
# number of files found in the dataset folder
len(file_paths)

177

In [4]:
# helper function
# returns the longest slice of v whose values each increase by exactly 1
def longestConseqSubArr(v):
    """Return the longest run of consecutive (+1 stepping) values in ``v``.

    Parameters
    ----------
    v : sequence of numbers
        Values to scan, in order. Only neighbouring elements are compared.

    Returns
    -------
    A slice of ``v`` holding the longest run in which every element equals
    its predecessor plus one. When several runs share the maximum length the
    earliest one is returned. An empty input yields ``[]``.
    """
    if not v:
        return []

    best_start = 0
    best_len = 0
    run_start = 0

    for pos, value in enumerate(v):
        # a run is broken whenever the value does not follow its predecessor
        if pos == 0 or value != v[pos - 1] + 1:
            run_start = pos

        run_len = pos - run_start + 1
        # strict comparison keeps the first of equally long runs
        if run_len > best_len:
            best_start, best_len = run_start, run_len

    return v[best_start:best_start + best_len]

In [5]:
# helper function
# keeps aspect ratio while resizing to 256x256, adds black pixels to create square
def resize_image(img, size=(256,256)):
    """Resize ``img`` to ``size`` without distorting its aspect ratio.

    A non-square image is first centred on a square, zero-filled (black)
    canvas whose side equals the longer image edge; that canvas is then
    resized to ``size``.

    Parameters
    ----------
    img : numpy.ndarray
        Image of shape (H, W) or (H, W, C).
    size : tuple of int, optional
        Target size handed to ``cv2.resize`` as ``dsize``. Defaults to
        (256, 256).

    Returns
    -------
    numpy.ndarray
        The resized image as returned by ``cv2.resize``.

    Notes
    -----
    The interpolation flag is passed by keyword. The third positional
    argument of ``cv2.resize`` is ``dst``, so a positional flag is not used
    as the interpolation mode.
    """
    h, w = img.shape[:2]

    # already square: no padding needed
    if h == w:
        return cv2.resize(img, size, interpolation=cv2.INTER_AREA)

    # side length of the square canvas = longer edge of the image
    dif = max(h, w)

    # INTER_AREA when the canvas is larger than the target (shrinking),
    # INTER_CUBIC otherwise (enlarging)
    interpolation = cv2.INTER_AREA if dif > (size[0] + size[1]) // 2 else cv2.INTER_CUBIC

    # top-left corner at which the image sits centred on the canvas
    x_pos = (dif - w) // 2
    y_pos = (dif - h) // 2

    # img.shape[2:] is () for grayscale and (C,) for multi-channel input
    mask = np.zeros((dif, dif) + tuple(img.shape[2:]), dtype=img.dtype)
    mask[y_pos:y_pos + h, x_pos:x_pos + w] = img

    return cv2.resize(mask, size, interpolation=interpolation)

In [6]:
# Pre-Processing Pipeline
def _padding_cutoff(row_means, padding_thresh):
    """Return the mean-intensity cutoff below which a row/column counts as padding."""
    cutoff = row_means[0] + padding_thresh
    for mean in row_means:
        if mean < cutoff:
            continue
        if mean > cutoff:
            break
        # mean is exactly on the cutoff: re-base the cutoff on this row
        cutoff = mean + padding_thresh
    return cutoff


def _split_margin_indices(means, cutoff):
    """Split below-cutoff positions into (leading margin, all other positions)."""
    leading = []
    rest = []
    for index, mean in enumerate(means):
        if mean < cutoff:
            # `leading` always holds 0..k-1, so an index equal to its length
            # is the one that extends the unbroken run starting at position 0
            if index == len(leading):
                leading.append(index)
            else:
                rest.append(index)
    return leading, rest


def preprocess_pipeline(file_paths, padding_thresh, image_thresh):
    """Preprocess every x-ray in ``file_paths`` and save the result as a PNG.

    For each file the image is read, CLAHE-equalized, cropped of
    low-intensity padding rows/columns, converted to grayscale, min-max
    normalized to [0, 1], resized to 256x256 with ``resize_image`` and given
    a trailing channel axis before being written to disk.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the images to process.
    padding_thresh : float
        Offset added to the mean intensity of the first image row to obtain
        the cutoff below which a row or column is treated as padding.
    image_thresh : float
        Cropping is skipped (the full equalized image is kept) when the
        number of rows plus columns selected for removal exceeds
        ``image_thresh * (height + width)``.

    Returns
    -------
    None
        Each result is saved as
        ``pipeline_outputs/<directory of the input file>/<name>_PROCESSED.png``;
        the output directory is created when missing.

    Notes
    -----
    Files for which ``io.imread`` raises ``ValueError`` are reported and
    skipped.
    """
    # iterate through all images in directory
    for file in file_paths:
        # extract name of file without directory or extension
        # EX: test/0001.png --> 0001
        file_stem = os.path.splitext(os.path.basename(file))[0]

        # read in image using skimage
        try:
            image = io.imread(file)
        except ValueError:
            print(file + " was not compatible file type.")
            # skip this file; falling through would reuse the previous
            # iteration's image (or hit an undefined name on the first file)
            continue

        # equalize image using CLAHE normalization
        eq_img = exposure.equalize_adapthist(image)

        # mean intensity of every row and of every column; swapaxes (rather
        # than .T) keeps axis 1 as the column axis for multi-channel images
        row_means = [np.mean(row) for row in eq_img]
        col_means = [np.mean(col) for col in np.swapaxes(eq_img, 0, 1)]

        # calculate cutoff value for insignificant value threshold
        cutoff = _padding_cutoff(row_means, padding_thresh)

        # find rows / columns to remove: the unbroken margin starting at the
        # top / left edge, plus every other below-cutoff row / column
        top_rows, other_rows = _split_margin_indices(row_means, cutoff)
        left_cols, other_cols = _split_margin_indices(col_means, cutoff)

        # of the remaining, not necessarily continuous rows / columns keep
        # only the largest continuous area, which prevents lower light levels
        # within the significant x-ray from being removed
        rows_to_drop = top_rows + longestConseqSubArr(other_rows)
        cols_to_drop = left_cols + longestConseqSubArr(other_cols)

        # if too much of the image would be removed, keep the equalized
        # image as is; protects x-rays that are significant edge to edge
        removed = len(rows_to_drop) + len(cols_to_drop)
        if removed > image_thresh * (image.shape[0] + image.shape[1]):
            cropped_img = eq_img
        else:
            # crop out rows and columns otherwise
            cropped_img = np.delete(eq_img, rows_to_drop, 0)
            cropped_img = np.delete(cropped_img, cols_to_drop, 1)
            # never continue with an image that was cropped down to nothing
            if cropped_img.size == 0:
                cropped_img = eq_img

        # convert to grayscale (only needed when a channel axis is present)
        if cropped_img.ndim == 3:
            gray_img = skimage.color.rgb2gray(cropped_img)
        else:
            gray_img = cropped_img

        # normalize pixel values to [0,1] range; a constant image has no
        # range to stretch, so map it to zeros instead of dividing by zero
        low = np.min(gray_img)
        high = np.max(gray_img)
        if high > low:
            norm_img = (gray_img - low) / (high - low)
        else:
            norm_img = np.zeros_like(gray_img, dtype=float)

        # resize image to 256x256
        resized_img = resize_image(norm_img)
        # add channel dimension
        resized_img = transform.resize(resized_img, (256,256,1))

        # save image next to a mirror of the input's own directory
        # EX File Name: test/0001.png
        # EX Saved File Name: pipeline_outputs/test/0001_PROCESSED.png
        source_dir = os.path.dirname(file).lstrip('/\\')
        output_dir = os.path.join('pipeline_outputs', source_dir)
        os.makedirs(output_dir, exist_ok=True)
        io.imsave(os.path.join(output_dir, file_stem + '_PROCESSED.png'), resized_img)

In [11]:
# hyperparameters
# padding_thresh: offset added to the first row's mean intensity to form the
#                 cutoff below which rows / columns are treated as padding
# image_thresh:   cropping is skipped when the rows + columns selected for
#                 removal exceed this fraction of (image height + image width)
padding_thresh = 0
image_thresh = 0.33

# time the pipeline over every path in file_paths
start = time.time()
preprocess_pipeline(file_paths, padding_thresh, image_thresh)
end = time.time()

# elapsed wall-clock time in seconds
print(end - start)





















138.24338006973267


# Testing Cells

In [7]:
# scratch check: pull the bare file name (no folder, no extension) out of a path
file = 'test/1209.png'

# drop everything up to the first '/', then everything from the first '.'
split = file.partition('/')[2].partition('.')[0]

split

'1209'

In [8]:
# scratch check: keep only the values below 1000, preserving their order
lists = [2342, 23, 5234, 569, 29]
lists = list(filter(lambda value: value < 1000, lists))

lists

[23, 569, 29]

In [9]:
# scratch check of the margin-splitting idea: a value joins list1 only when
# its predecessor is already in list1, every other value lands in list2
list1 = [1, 2]
list2 = []

x = [3, 4, 6, 8, 12, 13, 14, 15, 36]
for i in x:
    (list1 if (i - 1) in list1 else list2).append(i)

print(list1)
print(list2)

[1, 2, 3, 4]
[6, 8, 12, 13, 14, 15, 36]


In [10]:
# scratch check: print the longest consecutive run found in list2
print(longestConseqSubArr(list2))

[12, 13, 14, 15]
