In [1]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import pickle




In [2]:
def determine_disease(filename):
    """
    Tell whether an image is labelled as TB from its file name.

    The label is the character five positions from the end of the name,
    i.e. the one directly before a four-character extension such as '.png'.

    Input: name (or path) of the image file
    Output: True when that character is '1', otherwise False
    """
    label_char = filename[-5]
    return label_char == '1'
    
def increase_contrast(image, box_size=8, lim=2):
    """
    Run an image through OpenCV's CLAHE to boost its contrast.

    Input: image handed unchanged to the CLAHE object's apply()
    Optional: box_size -> tileGridSize=(box_size, box_size); lim -> clipLimit
    Output: whatever apply() returns for the image
    """
    tile_grid = (box_size, box_size)
    equalizer = cv2.createCLAHE(clipLimit=lim, tileGridSize=tile_grid)
    enhanced = equalizer.apply(image)
    return enhanced

def extraction(file, data_dict, size=(995, 750)):
    """
    Load one image, preprocess it, and append the image and its label to data_dict.

    Preprocessing steps: read the file as grayscale, apply increase_contrast,
    resize to `size`, then repeat the single channel three times along a new
    last axis so the array has three channels.

    Inputs:
        file: path of the image; also passed to determine_disease for the label
        data_dict: dict holding lists under the keys 'image' and 'label';
                   it is mutated in place
        size: (width, height) tuple forwarded to cv2.resize
    Output: None (results are appended to data_dict)
    Raises: OSError when cv2.imread cannot read the file
    """
    image = cv2.imread(file, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals failure by returning None instead of raising
    if image is None:
        raise OSError(f'Could not read image file: {file}')

    # increase the contrast
    cont_image = increase_contrast(image)
    resized_img = cv2.resize(cont_image, size)
    channeled_img = np.stack((resized_img,) * 3, axis=-1)

    # work out the label before appending anything, so a failure here cannot
    # leave the 'image' and 'label' lists with different lengths
    label = 1 if determine_disease(file) else 0

    data_dict['image'].append(channeled_img)
    data_dict['label'].append(label)

In [5]:
def build_dictionaries(path, train_fraction=0.8, seed=None):
    """
    Walk `path` for .png files, split them randomly into train/test sets and
    run extraction() on every file.

    Inputs:
        path: directory searched recursively with os.walk
        train_fraction: share of the files placed in the training set
        seed: optional seed for the shuffle; when None the global
              np.random state is used, as before
    Output: (data_train, data_test), each a dict with numpy arrays under
            the keys 'image' and 'label'
    """
    # collect every .png path; sorted so the listing order (and therefore a
    # seeded split) does not depend on the order the filesystem returns files
    filenames = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(path)
        for file in files
        if file.endswith('.png')
    )

    # create random permutation of indices
    if seed is None:
        indices = np.random.permutation(len(filenames))
    else:
        indices = np.random.default_rng(seed).permutation(len(filenames))

    # split the shuffled filenames into train / test
    split = int(len(filenames) * train_fraction)
    train_filenames = [filenames[i] for i in indices[:split]]
    test_filenames = [filenames[i] for i in indices[split:]]

    print("Training set size:", len(train_filenames))
    print("Testing set size:", len(test_filenames))

    data_train = {'label': [], 'image': []}
    data_test = {'label': [], 'image': []}

    # build training dictionary
    for idx, filename in enumerate(train_filenames):
        print(f'Train image: {idx}')
        extraction(filename, data_train)

    # build testing dictionary
    for idx, filename in enumerate(test_filenames):
        print(f'Test image: {idx}')
        extraction(filename, data_test)

    # turn the accumulated lists into arrays
    for data in (data_train, data_test):
        data['image'] = np.array(data['image'])
        data['label'] = np.array(data['label'])
    return data_train, data_test


'''
Pickle the data dictionaries so that feature extraction does not have to be repeated on every run.
'''
def pickle_out(file, data):
    """Serialize `data` with pickle.dump into `file`, opened for binary writing."""
    with open(file, mode='wb') as handle:
        pickle.dump(data, handle)
        
        
def pickle_in(file):
    """
    Open `file` for binary reading and return the object pickle.load yields.

    pickle.load can execute arbitrary code, so only use this on trusted files.
    """
    with open(file, mode='rb') as handle:
        return pickle.load(handle)

In [4]:
""" 
SET UP EXPERIMENTAL DESIGN HERE
Choose if you want to load data directly in from pickle file or make dicts by starting feature extraction
"""
# make_dicts: True  -> build the train/test dictionaries from the image folder;
#             False -> load them from the previously written pickle files.
make_dicts = False
# want_to_pickle: only consulted when make_dicts is True;
#                 True -> write the freshly built dictionaries to pickle files.
want_to_pickle = False

In [6]:
if make_dicts:
    # Image folder. Set the TB_IMAGE_DIR environment variable to point at the
    # data on another machine; the fallback is the original hard-coded location.
    path = os.environ.get(
        'TB_IMAGE_DIR',
        'C:/Users/jjfan/OneDrive - University of Illinois Chicago/Documents/Computer Science/CS 415/Project/clean_images',
    )
    data_train_loaded, data_test_loaded = build_dictionaries(path)
    if want_to_pickle:
        # cache the dictionaries so later runs can skip the extraction step
        pickle_out('data_train_2.pickle', data_train_loaded)
        pickle_out('data_test_2.pickle', data_test_loaded)
else:
    # Reuse the cached dictionaries. pickle can execute arbitrary code on
    # load, so these files must come from a trusted source.
    data_train_loaded = pickle_in('data_train_2.pickle')
    data_test_loaded = pickle_in('data_test_2.pickle')

In [7]:
# Sanity check on the training images: extraction() resizes to 995x750 (width x height)
# and stacks 3 channels, so the array is expected to be (n_images, 750, 995, 3).
print(data_train_loaded['image'].shape)

(529, 750, 995, 3)


In [8]:
# VGG16 convolutional base with ImageNet weights and without its classifier head
# (include_top=False); input_shape matches the (750, 995, 3) training images.
# NOTE(review): the images are fed to this base as stored in the data dictionaries;
# no tf.keras.applications.vgg16.preprocess_input step is visible in this notebook -
# confirm whether that is intended.
base_model = tf.keras.applications.VGG16(input_shape=(750, 995, 3), include_top=False, weights='imagenet')
# freeze the base so only the layers added on top of it are trained
base_model.trainable = False





In [7]:
# Binary classifier: frozen VGG16 base followed by a small dense head
model = models.Sequential()
model.add(base_model)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
# single sigmoid unit for the binary output
model.add(layers.Dense(1, activation='sigmoid'))

# # Build the CNN model
# model = models.Sequential()

# model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(1500, 1946, 1)))
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(64, (3, 3), activation='relu'))
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Conv2D(128, (3, 3), activation='relu'))
# model.add(layers.MaxPooling2D((2, 2)))

# model.add(layers.Flatten())
# model.add(layers.Dense(512, activation='relu'))
# model.add(layers.Dense(1, activation='sigmoid'))  # Sigmoid for binary classification





In [8]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 10 epochs; the test split is passed as validation data
model.fit(
    x=data_train_loaded['image'],
    y=data_train_loaded['label'],
    batch_size=32,
    epochs=10,
    validation_data=(data_test_loaded['image'], data_test_loaded['label']),
)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2a7b3d79610>

In [7]:
# Restore the previously saved first model through the notebook's pickle helper
# (this replaces whatever `model` currently holds). Only load trusted pickle files.
# NOTE(review): Keras provides model.save / load_model for persisting models;
# consider switching to that instead of pickle.
model = pickle_in('model_1.pickle')






In [8]:
# Score the model on the test split (returns loss followed by the compiled metric)
test_loss, test_acc = model.evaluate(
    x=data_test_loaded['image'],
    y=data_test_loaded['label'],
)
print(f'Test accuracy: {test_acc}')



Test accuracy: 0.8721804618835449


In [9]:
# Sigmoid outputs for the test images
y_pred = model.predict(x=data_test_loaded['image'])

# Threshold at 0.5 to obtain 0/1 class predictions
y_pred_binary = np.greater(y_pred, 0.5).astype(int)

# Per-class precision / recall / F1 against the true labels
report = classification_report(
    y_true=data_test_loaded['label'],
    y_pred=y_pred_binary,
)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.94      0.87        62
           1       0.94      0.82      0.87        71

    accuracy                           0.87       133
   macro avg       0.88      0.88      0.87       133
weighted avg       0.88      0.87      0.87       133



In [13]:
# Persist the first model through the notebook's pickle helper
pickle_out('model_1.pickle', model)

In [9]:
# Second classifier: same frozen VGG16 base and dense head as the first model
model2 = models.Sequential()
model2.add(base_model)
model2.add(layers.Flatten())
model2.add(layers.Dense(256, activation='relu'))
model2.add(layers.Dense(1, activation='sigmoid'))

# Same optimizer and loss, but recall is tracked alongside accuracy
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Recall()],
)




In [10]:
# Fit the second model for 5 epochs; the test split is passed as validation data
model2.fit(
    x=data_train_loaded['image'],
    y=data_train_loaded['label'],
    batch_size=32,
    epochs=5,
    validation_data=(data_test_loaded['image'], data_test_loaded['label']),
)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1eb90347650>

In [10]:
# Restore the previously saved second model through the notebook's pickle helper
# (this replaces whatever `model2` currently holds). Only load trusted pickle files.
model2 = pickle_in('model_2.pickle')

In [17]:
# Score the second model on the test split: loss, accuracy and recall
test_loss, test_acc, test_recall = model2.evaluate(
    x=data_test_loaded['image'],
    y=data_test_loaded['label'],
)
print(f'Test accuracy: {test_acc}\nTest recall: {test_recall}')

Test accuracy: 0.8646616339683533
Test recall: 0.8309859037399292


In [18]:
# Sigmoid outputs of the second model for the test images
y_pred = model2.predict(x=data_test_loaded['image'])

# Threshold at 0.5 to obtain 0/1 class predictions
y_pred_binary = np.greater(y_pred, 0.5).astype(int)

# Per-class precision / recall / F1 against the true labels
report = classification_report(
    y_true=data_test_loaded['label'],
    y_pred=y_pred_binary,
)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.90      0.86        62
           1       0.91      0.83      0.87        71

    accuracy                           0.86       133
   macro avg       0.87      0.87      0.86       133
weighted avg       0.87      0.86      0.86       133



In [19]:
# Persist the second model through the notebook's pickle helper
pickle_out('model_2.pickle', model2)