In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'tuberculosis-tb-chest-xray-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F891819%2F2332307%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240918%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240918T164910Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D6eddedb5ec48a0c71978c9edab4e438f9a4a43d50304d4a76c58d155d0d1fde7979ce35eb0cc139a9cd14c9a7b83c3483e9c62ca63bac72a1fe2ea53385552b9883b9dd2012e209928786e7b834d2747faf57e4b8b611734f9d4d9699bfd12ceee408cdcd9bcff9efe777445f0861c9108e501a11db5ae9e867c8d4ccde8dbe218805d46c05a0ccb7b05322cc44143ff140dd7f0db353f35c9b9aa5bb84bb420c4f2fac9d165f27b07fb4579b1c38b844cdbc3756fda0ff6bc013092d658ff61971ced824240ef2d95da924808c90d8028da3fb06ba0cb5e7188729c02261397faf616b7becba9b07de7232adc0bb733b462f97debd77fac0e297f0eaa823cff'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading tuberculosis-tb-chest-xray-dataset, 695602161 bytes compressed
Downloaded and uncompressed: tuberculosis-tb-chest-xray-dataset
Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Normal.metadata.xlsx
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/README.md.txt
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis.metadata.xlsx
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-343.png
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-245.png
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-48.png
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-280.png
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-592.png
/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database/Tuberculosis/Tuberculosis-332.png
/kaggle/input/tuberculos

In [3]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Concatenate


In [4]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import shutil

# Set paths
data_dir = '/kaggle/input/tuberculosis-tb-chest-xray-dataset/TB_Chest_Radiography_Database'
output_dir = '/kaggle/working/split_data'  # Temporary directory to hold split data

# Define the classes
classes = ['Normal', 'Tuberculosis']

# Create directories for train/val/test
for split in ['train', 'val', 'test']:
    for class_name in classes:
        os.makedirs(os.path.join(output_dir, split, class_name), exist_ok=True)

# Get all image paths and labels
image_paths = []
labels = []

for class_name in classes:
    class_dir = os.path.join(data_dir, class_name)
    for image_name in os.listdir(class_dir):
        image_paths.append(os.path.join(class_dir, image_name))
        labels.append(class_name)

# Split the data into train, validation, and test sets
train_paths, test_paths, train_labels, test_labels = train_test_split(image_paths, labels, test_size=0.2, stratify=labels, random_state=42)
train_paths, val_paths, train_labels, val_labels = train_test_split(train_paths, train_labels, test_size=0.25, stratify=train_labels, random_state=42)  # 0.25 of 0.8 = 0.2

# Copy the images to the corresponding directories
def copy_images(file_paths, labels, split):
    for file_path, label in zip(file_paths, labels):
        shutil.copy(file_path, os.path.join(output_dir, split, label))

copy_images(train_paths, train_labels, 'train')
copy_images(val_paths, val_labels, 'val')
copy_images(test_paths, test_labels, 'test')


In [5]:
# Image data generator for augmentation
datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    brightness_range=[0.8, 1.2],
    fill_mode='nearest'
)

# Load train, validation, and test data
train_data = datagen.flow_from_directory(
    '/kaggle/working/split_data/train',
    target_size=(224, 224),
    batch_size=8,
    class_mode='categorical'
)

validation_data = datagen.flow_from_directory(
    '/kaggle/working/split_data/val',
    target_size=(224, 224),
    batch_size=8,
    class_mode='categorical'
)

test_data = datagen.flow_from_directory(
    '/kaggle/working/split_data/test',
    target_size=(224, 224),
    batch_size=8,
    class_mode='categorical',
    shuffle=False
)


Found 2520 images belonging to 2 classes.
Found 840 images belonging to 2 classes.
Found 840 images belonging to 2 classes.


In [6]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0, DenseNet121
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Concatenate, Reshape, MultiHeadAttention, LayerNormalization

# Input layer
input_shape = (224, 224, 3)
inputs = tf.keras.Input(shape=input_shape)

# EfficientNetB0 feature extraction
efficient_net = EfficientNetB0(include_top=False, weights='imagenet', input_tensor=inputs)
efficient_net_output = GlobalAveragePooling2D()(efficient_net.output)

# DenseNet121 feature extraction
dense_net = DenseNet121(include_top=False, weights='imagenet', input_tensor=inputs)
dense_net_output = GlobalAveragePooling2D()(dense_net.output)

# Concatenate features from both networks
concatenated_features = Concatenate()([efficient_net_output, dense_net_output])

# Add Dropout for regularization
concatenated_features = Dropout(0.3)(concatenated_features)

# Reshape concatenated_features for MultiHeadAttention
# MultiHeadAttention expects input shape (batch_size, seq_len, feature_dim)
# Add a new dimension for seq_len (e.g., 1)
reshaped_features = Reshape((1, -1))(concatenated_features)

# Multi-head attention layer
attention_output = MultiHeadAttention(num_heads=4, key_dim=128)(reshaped_features, reshaped_features)
attention_output = LayerNormalization(epsilon=1e-6)(attention_output)

# Flatten the attention output
flattened_output = tf.keras.layers.Flatten()(attention_output)

# Add a fully connected layer
x = Dense(512, activation='relu')(flattened_output)

# Final classification layer
outputs = Dense(2, activation='softmax')(x)

# Create the model
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [7]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Define the class labels (normal = 0, tuberculosis = 1)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_data.classes),
                                     y=train_data.classes)

# Convert to a dictionary format for Keras
class_weights = dict(enumerate(class_weights))
print("Class weights:", class_weights)


Class weights: {0: 0.6, 1: 3.0}


In [8]:
# Train the model with class weights
history = model.fit(train_data,
                    validation_data=validation_data,
                    epochs=5,
                    class_weight=class_weights)


Epoch 1/5


  self._warn_if_super_not_called()


[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 364ms/step - accuracy: 0.7774 - loss: 1.1610 - val_accuracy: 0.3881 - val_loss: 0.8385
Epoch 2/5
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 272ms/step - accuracy: 0.9361 - loss: 0.1801 - val_accuracy: 0.8679 - val_loss: 0.3253
Epoch 3/5
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 269ms/step - accuracy: 0.9506 - loss: 0.1649 - val_accuracy: 0.9762 - val_loss: 0.0656
Epoch 4/5
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 274ms/step - accuracy: 0.9544 - loss: 0.1935 - val_accuracy: 0.1952 - val_loss: 1.1779
Epoch 5/5
[1m315/315[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 269ms/step - accuracy: 0.9431 - loss: 0.2175 - val_accuracy: 0.9667 - val_loss: 0.0899


In [9]:
model.save("classNormalization_attention_mechanism.h5")



In [10]:
# Evaluate on test data
test_loss, test_acc = model.evaluate(test_data)
print(f'Test Loss: {test_loss}, Test Accuracy: {test_acc}')

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 176ms/step - accuracy: 0.9924 - loss: 0.0264
Test Loss: 0.1134900152683258, Test Accuracy: 0.9607142806053162


In [11]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Evaluate on the test data
test_loss, test_acc = model.evaluate(test_data)

# Predict on test data to get confusion matrix, precision, recall, etc.
y_true = test_data.classes
y_pred_probs = model.predict(test_data)
y_pred = y_pred_probs.argmax(axis=1)

# Confusion Matrix
print(confusion_matrix(y_true, y_pred))

# Classification Report (precision, recall, f1-score)
print(classification_report(y_true, y_pred, target_names=['Normal', 'Tuberclosis']))

# AUC score
auc = roc_auc_score(y_true, y_pred_probs[:, 1])
print(f"AUC: {auc}")

[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 185ms/step - accuracy: 0.9962 - loss: 0.0144




[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 160ms/step
[[696   4]
 [ 30 110]]
              precision    recall  f1-score   support

      Normal       0.96      0.99      0.98       700
 Tuberclosis       0.96      0.79      0.87       140

    accuracy                           0.96       840
   macro avg       0.96      0.89      0.92       840
weighted avg       0.96      0.96      0.96       840

AUC: 0.9903877551020408
