In [1]:
import cv2
import csv
import os
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn

In [2]:
# Load libraries
from pandas import read_csv
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Reference no.1 = https://blog.finxter.com/5-best-ways-to-remove-black-background-and-make-it-transparent-using-opencv-python/
# Reference no.2 = https://www.geeksforgeeks.org/image-thresholding-in-python-opencv/
# Reference no.3 = https://docs.opencv.org/3.4/d7/d4d/tutorial_py_thresholding.html
# Reference no.4 = https://herts.instructure.com/courses/112101/pages/running-python-simply?module_item_id=36661346
# Reference no.5 = https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
# Reference no.6 = https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html
# Reference no.7 = https://www.geeksforgeeks.org/python-extracting-rows-using-pandas-iloc/
# Reference no.8 = https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.values.html
# Reference no.9 = https://stackoverflow.com/questions/37891954/keras-how-do-i-predict-after-i-trained-a-model

In [4]:
# Directory holding the healthy (normal) images. The per-image CSV files are
# written next to the images, and the summary cell below reads `directory`.
directory = "C:/images/NORMAL"

# Collect every file whose extension marks it as an image (case-insensitive)
image_files = [f for f in os.listdir(directory) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.jfif'))]

# CLAHE (Contrast-Limited Adaptive Histogram Equalization) uses the same
# parameters for every image, so the object is created once outside the loop.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

for image_file in image_files:
    image_path = os.path.join(directory, image_file)

    # cv2.imread returns None instead of raising when a file cannot be decoded;
    # skip such files so one bad image does not abort the whole batch.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable image: {image_path}")
        continue

    # Greyscale conversion followed by local contrast equalisation
    grey_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    equalised_image = clahe.apply(grey_image)

    # Otsu's method picks the threshold automatically; the binary mask is only
    # used to locate contours, not written out.
    _, thresholded = cv2.threshold(equalised_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresholded, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        # Nothing to crop around, so no CSV is produced for this image
        print(f"Skipping image with no contours: {image_path}")
        continue

    # Assume the largest external contour is the object to keep and crop the
    # equalised image to its bounding rectangle.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    crop = equalised_image[y:y+h, x:x+w]

    # Resize to a fixed 224x224 so every image yields the same number of values
    resized_crop = cv2.resize(crop, (224, 224))

    # CSV name = image name with its extension replaced by .csv
    csv_file_name = os.path.splitext(image_file)[0] + '.csv'
    csv_file_path = os.path.join(directory, csv_file_name)

    # One greyscale value per row, in row-major pixel order
    with open(csv_file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Greyscale"])
        writer.writerows([value] for value in resized_crop.ravel().tolist())

print("All normal files processed.")

All normal files processed.


In [None]:
# Name and location of the summary file written at the end of this cell.
# It lives in the same directory as the per-image CSV files read below.
summary_file_name = "normal summary means.csv"
summary_file_path = os.path.join(directory, summary_file_name)

# One [mean, category, file name] entry per per-image CSV
summary_data = []

# sorted() gives a deterministic row order regardless of how the OS lists files
for csv_file in sorted(os.listdir(directory)):
    # Only per-image CSVs are wanted. Without the second check, re-running this
    # cell would read the previous summary file and add a bogus row for it.
    if not csv_file.endswith('.csv') or csv_file.lower() == summary_file_name.lower():
        continue

    csv_file_path = os.path.join(directory, csv_file)
    with open(csv_file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file

        # Accumulate the single greyscale value found on each non-empty row
        grey_sum = 0
        count = 0
        for row in reader:
            if row:
                grey_sum += float(row[0])
                count += 1

    # Mean greyscale value; 0 when the file held no data rows
    grey_mean = grey_sum / count if count else 0

    # The last entry becomes the "File Name" column (CSV name without extension)
    summary_data.append([grey_mean, "Normal", os.path.splitext(csv_file)[0]])

# Write the summary rows under their header
with open(summary_file_path, 'w', newline='') as summary_file:
    writer = csv.writer(summary_file)
    writer.writerow(["Greyscale Mean", "Category", "File Name"])
    writer.writerows(summary_data)

print("Summary of means for normal has been written.")

In [None]:
# Directory holding the pneumonia images. The per-image CSV files are written
# next to the images, and the summary cell below reads `directory`.
directory = "C:/images/PNEUMONIA"

# Collect every file whose extension marks it as an image (case-insensitive)
image_files = [f for f in os.listdir(directory) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.jfif'))]

# CLAHE (Contrast-Limited Adaptive Histogram Equalization) uses the same
# parameters for every image, so the object is created once outside the loop.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

for image_file in image_files:
    image_path = os.path.join(directory, image_file)

    # cv2.imread returns None instead of raising when a file cannot be decoded;
    # skip such files so one bad image does not abort the whole batch.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable image: {image_path}")
        continue

    # Greyscale conversion followed by local contrast equalisation
    grey_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    equalised_image = clahe.apply(grey_image)

    # Otsu's method picks the threshold automatically; the binary mask is only
    # used to locate contours, not written out.
    _, thresholded = cv2.threshold(equalised_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresholded, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        # Nothing to crop around, so no CSV is produced for this image
        print(f"Skipping image with no contours: {image_path}")
        continue

    # Assume the largest external contour is the object to keep and crop the
    # equalised image to its bounding rectangle.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    crop = equalised_image[y:y+h, x:x+w]

    # Resize to a fixed 224x224 so every image yields the same number of values
    resized_crop = cv2.resize(crop, (224, 224))

    # CSV name = image name with its extension replaced by .csv
    csv_file_name = os.path.splitext(image_file)[0] + '.csv'
    csv_file_path = os.path.join(directory, csv_file_name)

    # One greyscale value per row, in row-major pixel order
    with open(csv_file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Greyscale"])
        writer.writerows([value] for value in resized_crop.ravel().tolist())

print("All pneumonia files processed.")

In [None]:
# Name and location of the summary file written at the end of this cell.
# It lives in the same directory as the per-image CSV files read below.
summary_file_name = "pneumonia summary means.csv"
summary_file_path = os.path.join(directory, summary_file_name)

# One [mean, category, file name] entry per per-image CSV
summary_data = []

# sorted() gives a deterministic row order regardless of how the OS lists files
for csv_file in sorted(os.listdir(directory)):
    # Only per-image CSVs are wanted. Without the second check, re-running this
    # cell would read the previous summary file and add a bogus row for it.
    if not csv_file.endswith('.csv') or csv_file.lower() == summary_file_name.lower():
        continue

    csv_file_path = os.path.join(directory, csv_file)
    with open(csv_file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file

        # Accumulate the single greyscale value found on each non-empty row
        grey_sum = 0
        count = 0
        for row in reader:
            if row:
                grey_sum += float(row[0])
                count += 1

    # Mean greyscale value; 0 when the file held no data rows
    grey_mean = grey_sum / count if count else 0

    # The last entry becomes the "File Name" column (CSV name without extension)
    summary_data.append([grey_mean, "Pneumonia", os.path.splitext(csv_file)[0]])

# Write the summary rows under their header
with open(summary_file_path, 'w', newline='') as summary_file:
    writer = csv.writer(summary_file)
    writer.writerow(["Greyscale Mean", "Category", "File Name"])
    writer.writerows(summary_data)

print("Summary of means for pneumonia has been written.")

In [None]:
# Directory holding the unseen test-set images. The per-image CSV files are
# written next to the images, and the summary cell below reads `directory`.
directory = "C:/images/TEST SET"

# Collect every file whose extension marks it as an image (case-insensitive)
image_files = [f for f in os.listdir(directory) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.jfif'))]

# CLAHE (Contrast-Limited Adaptive Histogram Equalization) uses the same
# parameters for every image, so the object is created once outside the loop.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

for image_file in image_files:
    image_path = os.path.join(directory, image_file)

    # cv2.imread returns None instead of raising when a file cannot be decoded;
    # skip such files so one bad image does not abort the whole batch.
    image = cv2.imread(image_path)
    if image is None:
        print(f"Skipping unreadable image: {image_path}")
        continue

    # Greyscale conversion followed by local contrast equalisation
    grey_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    equalised_image = clahe.apply(grey_image)

    # Otsu's method picks the threshold automatically; the binary mask is only
    # used to locate contours, not written out.
    _, thresholded = cv2.threshold(equalised_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(thresholded, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not contours:
        # Nothing to crop around, so no CSV is produced for this image
        print(f"Skipping image with no contours: {image_path}")
        continue

    # Assume the largest external contour is the object to keep and crop the
    # equalised image to its bounding rectangle.
    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)
    crop = equalised_image[y:y+h, x:x+w]

    # Resize to a fixed 224x224 so every image yields the same number of values
    resized_crop = cv2.resize(crop, (224, 224))

    # CSV name = image name with its extension replaced by .csv
    csv_file_name = os.path.splitext(image_file)[0] + '.csv'
    csv_file_path = os.path.join(directory, csv_file_name)

    # One greyscale value per row, in row-major pixel order
    with open(csv_file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Greyscale"])
        writer.writerows([value] for value in resized_crop.ravel().tolist())

print("All test files processed.")

In [None]:
# Name and location of the summary file written at the end of this cell.
# It lives in the same directory as the per-image CSV files read below.
summary_file_name = "test summary means.csv"
summary_file_path = os.path.join(directory, summary_file_name)

# One [mean, category, file name] entry per per-image CSV
summary_data = []

# sorted() gives a deterministic row order regardless of how the OS lists files
for csv_file in sorted(os.listdir(directory)):
    # Only per-image CSVs are wanted. Without the second check, re-running this
    # cell would read the previous summary file and add a bogus row for it.
    if not csv_file.endswith('.csv') or csv_file.lower() == summary_file_name.lower():
        continue

    csv_file_path = os.path.join(directory, csv_file)
    with open(csv_file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip header; the default avoids StopIteration on an empty file

        # Accumulate the single greyscale value found on each non-empty row
        grey_sum = 0
        count = 0
        for row in reader:
            if row:
                grey_sum += float(row[0])
                count += 1

    # Mean greyscale value; 0 when the file held no data rows
    grey_mean = grey_sum / count if count else 0

    # The last entry becomes the "File Name" column (CSV name without extension)
    summary_data.append([grey_mean, "Test", os.path.splitext(csv_file)[0]])

# Write the summary rows under their header
with open(summary_file_path, 'w', newline='') as summary_file:
    writer = csv.writer(summary_file)
    writer.writerow(["Greyscale Mean", "Category", "File Name"])
    writer.writerows(summary_data)

print("Summary of means for test has been written.")

In [None]:
# Per-class summary files to merge. The directory names use the same casing as
# the cells that wrote these files ("NORMAL" / "PNEUMONIA"), so the paths also
# resolve on case-sensitive file systems.
summary_files = [
    "C:/images/NORMAL/normal summary means.csv",
    "C:/images/PNEUMONIA/pneumonia summary means.csv"
]
# Destination of the merged training CSV
combined_summary_file_path = "C:/images/combined summary means.csv"

# Concatenate the summaries: the header is taken from the first file only,
# every file contributes all of its data rows.
with open(combined_summary_file_path, 'w', newline='') as combined_file:
    writer = csv.writer(combined_file)

    for i, summary_file in enumerate(summary_files):
        with open(summary_file, 'r', newline='') as file:
            reader = csv.reader(file)
            header = next(reader)  # Each summary file starts with a header row
            if i == 0:
                writer.writerow(header)
            writer.writerows(reader)  # Remaining rows are data

print("Combined summary has been written.")

In [None]:
# Load the combined summary (columns: "Greyscale Mean", "Category", "File Name")
# using the header row stored in the file itself.
url = "C:/images/combined summary means.csv"
dataset = pandas.read_csv(url)

In [None]:
# Dimensions of the dataset, printed as a (rows, columns) tuple
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

In [None]:
# Show up to the first 5803 rows of the dataset.
# NOTE(review): 5803 presumably matches the total row count of the combined file - confirm.
first_rows = dataset.head(5803)
print(first_rows)

In [None]:
# Descriptive statistics as reported by DataFrame.describe()
summary_stats = dataset.describe()
print(summary_stats)

In [None]:
# Number of rows per value of the "Category" column
class_counts = dataset.groupby('Category').size()
print(class_counts)

In [None]:
# Box-and-whisker plot of the single feature column
dataset.plot.box(y='Greyscale Mean', title='Box Plot of Greyscale Means')
plt.show()

In [None]:
# Histogram of the single feature column
greyscale_means = dataset['Greyscale Mean']
greyscale_means.hist()
plt.title('Histogram of Greyscale Means')
plt.show()

In [None]:
# Split out a validation set.
# Columns are selected by name rather than by position in `dataset.values`:
# the frame mixes numeric and text columns, so `.values` is an object array and
# positional slicing silently breaks if the column order ever changes.
X = dataset[['Greyscale Mean']].to_numpy(dtype=float)  # 2-D feature matrix with one column
y = dataset['Category'].to_numpy()                      # class label for each row

# test_size=0.2 holds out 20% of the rows for validation; the fixed
# random_state makes the split reproducible.
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Spot-check a set of standard classifiers on the training split.
# `multi_class='ovr'` is no longer passed to LogisticRegression: the argument is
# deprecated in recent scikit-learn releases, and with the liblinear solver the
# default already resolves to one-vs-rest, so the fitted model is unchanged.
models = [
    ('LR', LogisticRegression(solver='liblinear')),   # logistic regression
    ('LDA', LinearDiscriminantAnalysis()),            # linear discriminant analysis
    ('KNN', KNeighborsClassifier()),                  # k-nearest neighbours (default k)
    ('CART', DecisionTreeClassifier()),               # decision tree
    ('NB', GaussianNB()),                             # Gaussian naive Bayes
    ('SVM', SVC(gamma='auto')),                       # support vector classifier
]

# The fold definition is the same for every model, so build it once. With a
# fixed integer random_state each model is scored on identical splits.
kfold = StratifiedKFold(n_splits=3, random_state=1, shuffle=True)

# Evaluate each model in turn; `results` and `names` feed the comparison plot
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Mean accuracy across folds, with the standard deviation in parentheses
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Compare algorithms: one box per model, built from its cross-validation scores.
# Tick labels are set with plt.xticks rather than boxplot's `labels=` keyword,
# which newer Matplotlib releases deprecate in favour of `tick_labels=`; xticks
# works on both old and new versions. Boxes are drawn at positions 1..N by default.
plt.boxplot(results)
plt.xticks(range(1, len(names) + 1), names)
plt.title('Algorithm Comparison')
plt.show()

In [None]:
# Train the support vector classifier on the training split (fit returns the
# estimator itself), then predict labels for the held-out validation rows.
model = SVC(gamma='auto').fit(X_train, Y_train)
predictions = model.predict(X_validation)

In [None]:
# Print accuracy, confusion matrix and per-class report for the validation set
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, predictions))

In [None]:
# Summary file(s) of the unseen test images.
# A dedicated name is used instead of rebinding `directory` (a str path in the
# preprocessing cells) to a list, which would break those cells on re-run.
unseen_summary_files = ['C:/images/TEST SET/test summary means.csv']

# Load each CSV file and concatenate into one DataFrame [6]
unseen_data_frames = [pandas.read_csv(f) for f in unseen_summary_files]
data = pandas.concat(unseen_data_frames, ignore_index=True)

# Select test criteria: the first column, kept 2-D as the model expects [7][8]
features_for_prediction = data.iloc[:, [0]].values

# Predict with the trained model [9]. The result gets its own name so the
# validation `predictions` used by the metrics cell are not overwritten.
unseen_predictions = model.predict(features_for_prediction)

# Print predictions for each unseen image (index = row in the summary file)
for i, prediction in enumerate(unseen_predictions):
    print(f"Unseen image {i} predicted as: {prediction}")