In [2]:
import matplotlib.pyplot as plt
import numpy as np
import PIL
import tensorflow as tf
import os
import pandas as pd
import cv2

from PIL import Image
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.applications.resnet50 import preprocess_input, ResNet50
from tensorflow.keras.preprocessing import image as keras_image

import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [3]:
DATA_DIR = 'oral cancer.v3i.tensorflow'
TEST_DIR = os.path.join(DATA_DIR, 'test')
TRAIN_DIR = os.path.join(DATA_DIR, 'train')
VALID_DIR = os.path.join(DATA_DIR, 'valid')

test_annotations = pd.read_csv(os.path.join(TEST_DIR, '_annotations.csv'))
train_annotations = pd.read_csv(os.path.join(TRAIN_DIR, '_annotations.csv'))
valid_annotations = pd.read_csv(os.path.join(VALID_DIR, '_annotations.csv'))

In [6]:
def filter_annotations(annotations, img_dir):
    """
    Filters the annotations to include only images present in the specified directory.
    
    Args:
        annotations (pandas.DataFrame): The annotations DataFrame.
        img_dir (str): The directory path containing the images.
        
    Returns:
        pandas.DataFrame: The filtered annotations DataFrame.
    """
    image_files = [f for f in os.listdir(img_dir)]
    annotations = annotations[annotations['filename'].isin(image_files)]
    annotations = annotations.drop_duplicates(subset=['filename'])
    return annotations

test_annotations = filter_annotations(test_annotations, os.path.join(TEST_DIR))
train_annotations = filter_annotations(train_annotations, os.path.join(TRAIN_DIR))
valid_annotations = filter_annotations(valid_annotations, os.path.join(VALID_DIR))

In [7]:
print(len(test_annotations))
print(len(train_annotations))
print(len(valid_annotations))

423
6411
603


In [8]:
def separate_images(annotations):
    """
    Separates the images into cancerous and non-cancerous groups based on the annotations.

    Args:
        annotations (pandas.DataFrame): The annotations DataFrame.

    Returns:
        tuple: A tuple containing two lists:
            cancerous (list): A list of cancerous image paths.
            non_cancerous (list): A list of non-cancerous image paths.
    """
    cancerous = []
    non_cancerous = []

    for _, row in annotations.iterrows():
        image_path = os.path.join(row['filename'])
        if row['class'] == "cancer":  # Cancerous
            cancerous.append(image_path)
        elif row["class"] == "no cancer":  # Non-cancerous
            non_cancerous.append(image_path)

    return cancerous, non_cancerous

test_cancerous, test_non_cancerous = separate_images(test_annotations)
train_cancerous, train_non_cancerous = separate_images(train_annotations)
valid_cancerous, valid_non_cancerous = separate_images(valid_annotations)
print(len(test_cancerous), len(test_non_cancerous))
print(len(train_cancerous), len(train_non_cancerous))
print(len(valid_cancerous), len(valid_non_cancerous))

203 220
3226 3185
295 308


In [None]:
cancerous_image_names = test_cancerous[:100]# + valid_cancerous + train_cancerous
non_cancerous_image_names = test_non_cancerous[:100]# + valid_non_cancerous + train_non_cancerous

In [9]:
cancerous_image_paths = [os.path.join(TEST_DIR, img) for img in test_cancerous] + [os.path.join(VALID_DIR, img) for img in valid_cancerous]# + [os.path.join(TRAIN_DIR, img) for img in train_cancerous] 
non_cancerous_image_paths = [os.path.join(TEST_DIR, img) for img in test_non_cancerous] + [os.path.join(VALID_DIR, img) for img in valid_non_cancerous]# + [os.path.join(TRAIN_DIR, img) for img in train_non_cancerous]

In [10]:
import os
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from skimage import exposure, filters, restoration, transform, util
from skimage.restoration import denoise_nl_means, estimate_sigma
from skimage.util import random_noise

# Helper function to load and preprocess an image
def load_preprocess_image(image_path, target_size=(224, 224)):
    image = Image.open(image_path)

    # Resize and pad to maintain aspect ratio
    image = np.array(image)
    max_side = max(image.shape[:2])
    delta = max_side - min(image.shape[:2])
    top, bottom = delta // 2, delta - (delta // 2)
    left, right = 0, 0
    if image.shape[0] < image.shape[1]:
        left, right = delta // 2, delta - (delta // 2)
    image = np.pad(image, [(top, bottom), (left, right), (0, 0)], mode='constant')
    image = transform.resize(image, target_size, anti_aliasing=True)

    # Data augmentation: Random rotations and flips
    if np.random.rand() < 0.5:
        image = np.flip(image, axis=0)  # Vertical flip
    if np.random.rand() < 0.5:
        image = np.flip(image, axis=1)  # Horizontal flip
    angle = np.random.randint(-20, 20)
    image = transform.rotate(image, angle, preserve_range=True)

    # Normalize using mean and std
    image = (image - np.mean(image)) / np.std(image)

    # Clip pixel values to the valid range
    image = np.clip(image, -1.0, 1.0)

    # Contrast enhancement using CLAHE
    image = exposure.equalize_adapthist(image, clip_limit=0.03)

    # Denoising using Non-Local Means
    sigma_est = np.mean(estimate_sigma(image, channel_axis=-1))
    patch_kw = dict(patch_size=5, patch_distance=6, channel_axis=-1)
    image = denoise_nl_means(image, h=0.6 * sigma_est, sigma=sigma_est, fast_mode=True, **patch_kw)

    return image

# Load and preprocess all images
cancerous_images = [load_preprocess_image(path) for path in cancerous_image_paths]
non_cancerous_images = [load_preprocess_image(path) for path in non_cancerous_image_paths]

# Combine images and labels
X = cancerous_images + non_cancerous_images
y = [1] * len(cancerous_images) + [0] * len(non_cancerous_images)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB7
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Data augmentation and preprocessing
data_generator = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input
)

# Load the pre-trained EfficientNetB7 model without the top classification layers
base_model = EfficientNetB7(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model
base_model.trainable = False

# Build the model
model = Sequential([
    base_model,
    GlobalAveragePooling2D(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])


In [None]:

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall', 'AUC'])

# Define callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, verbose=1)
model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', save_best_only=True, verbose=1, save_weights_only=False)


# Augment training data
train_generator = data_generator.flow(np.array(X_train), np.array(y_train), batch_size=32)# Train the model
history = model.fit(
    train_generator,
    epochs=50,
    validation_data=(np.array(X_test), np.array(y_test)),
    callbacks=[early_stop, model_checkpoint]
)


In [20]:
# Load the best model
import time 

best_model = tf.keras.models.load_model('best_model.keras')

best_model.summary()

# Evaluate on the test set
y_pred = (best_model.predict(np.array(X_test)) > 0.5).astype(int).ravel()
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision*100:.2f}%")
print(f"Recall: {recall*100:.2f}%")
print(f"F1-score: {f1*100:.2f}%")

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 7s/step
Accuracy: 45.63%
Precision: 45.90%
Recall: 54.90%
F1-score: 50.00%


In [None]:

def extract_features(image_paths, batch_size=32):
    """
    Extracts features from a list of image paths using a pre-trained CNN.
    
    Args:
        image_paths (list): A list of paths to the input images.
        batch_size (int): The batch size for feature extraction.
        
    Returns:
        features (torch.Tensor): A tensor containing the extracted features for all images.
    """
    # Define data preprocessing and augmentation steps
    data_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    
    # Load the pre-trained ResNet-50 model
    model = models.resnet50(pretrained=True)
    model.eval()
    
    # Create a feature extractor by removing the final classification layer
    feature_extractor = nn.Sequential(*list(model.children())[:-1])
    
    # Initialize lists to store features and image tensors
    features = []
    image_tensors = []
    
    # Loop over the image paths and extract features
    for path in image_paths:
        image = Image.open(path)
        image_tensor = data_transforms(image).unsqueeze(0)
        image_tensors.append(image_tensor)
        
        # Extract features in batches for efficiency
        if len(image_tensors) == batch_size:
            batch_tensor = torch.cat(image_tensors, dim=0)
            batch_features = feature_extractor(batch_tensor)
            features.append(batch_features.squeeze())
            image_tensors = []
    
    # Extract features for the remaining images
    if image_tensors:
        batch_tensor = torch.cat(image_tensors, dim=0)
        batch_features = feature_extractor(batch_tensor)
        features.append(batch_features.squeeze())
    
    # Concatenate all features into a single tensor
    features = torch.cat(features, dim=0)
    
    return features

In [None]:
# Extract features from cancerous and non-cancerous images
cancerous_features = extract_features(cancerous_image_paths)
non_cancerous_features = extract_features(non_cancerous_image_paths)

In [None]:
# Combine features and labels
all_features = torch.cat([cancerous_features, non_cancerous_features], dim=0)
all_labels = torch.tensor([1] * len(cancerous_image_paths) + [0] * len(non_cancerous_image_paths))

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(all_features, all_labels, test_size=0.2, random_state=42)


In [None]:
from torchvision.transforms import v2
# Define data augmentation and preprocessing steps
data_transforms = transforms.Compose([
    v2.ToPILImage(),
    v2.RandomHorizontalFlip(p=0.5),
    v2.RandomVerticalFlip(p=0.5),
    v2.RandomRotation(degrees=20),
    v2.ToTensor(),
    v2.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Apply data augmentation to training data
X_train = [data_transforms(img.view(1, 1, 2048)) for img in X_train]

In [None]:
# Train the SVM classifier
model = SVC(kernel='linear', probability=True)
model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')