# Pre Processing

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score
from sklearn.impute import SimpleImputer

### pre_processing data

In [None]:
#load the tabular metadata
path = r"C:\Users\PC\Documents\train-metadata.csv"
data = pd.read_csv(path)

#class counts as loaded, kept only to report the imbalance
counts_before = data['target'].value_counts()

#separate the two classes so the positive class can be oversampled
negatives = data[data['target'] == 0]
positives = data[data['target'] == 1]

#draw positives with replacement until both classes have the same number of rows
#NOTE(review): this happens before the train/test split further down, so copies of
#the same positive row can end up in both the training and the test set
oversampled_positives = resample(
    positives,
    replace=True,
    n_samples=len(negatives),
    random_state=42,
)
data = pd.concat([negatives, oversampled_positives]) #balanced dataset used from here on

#class counts after balancing
counts_after = data['target'].value_counts()

print(f"Data distribution before balancing: {counts_before}")
print(f"Data distribution after balancing: {counts_after}")

In [None]:
#columns considered irrelevant or repetitive for the tabular models
unused_columns = [
    'isic_id', 'patient_id', 'image_type', 'tbp_tile_type', 'copyright_license',
    'lesion_id', 'iddx_full', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',
    'mel_mitotic_index', 'mel_thick_mm',
]
data = data.drop(columns=unused_columns)
data.head(5)

In [None]:
#numeric feature columns; the label column 'target' is left out
numeric_candidates = data.select_dtypes(include=['float64', 'int64']).columns
numeric = numeric_candidates[numeric_candidates != 'target'].tolist()
#text-valued (object dtype) columns
categorical = data.select_dtypes(include=['object']).columns

#missing numbers are filled with the column mean, missing categories with the column mode
imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

for imputer, columns in ((imputer_num, numeric), (imputer_cat, categorical)):
    data[columns] = imputer.fit_transform(data[columns])

print(data.isnull().sum()) #every count should now be zero

____________________________________________________________________________________________________________________
All missing values have been handled; scaling and categorical encoding are applied in the cleaning section below, after which the dataset is ready for modeling
____________________________________________________________________________________________________________________

### visualization

In [None]:
#count plot comparing how many rows of each target class there are per sex

plt.figure(figsize=(5, 3))
sns.countplot(x='sex', hue='target', data=data, palette='Set3')
#the x axis shows 'sex', so the title names that column (it previously said "Age")
plt.title('Count Plot of Sex against Having Cancer')
plt.xlabel('sex')
plt.ylabel('Count')
plt.legend(title='target')
plt.show()

In [None]:
#scatter plot of approximate age (y axis) for each anatomical site (x axis), coloured by target
plt.figure(figsize=(8, 5))
sns.scatterplot(
    data=data,
    x='anatom_site_general',
    y='age_approx',
    hue='target',
    palette='coolwarm')
plt.title('Age vs Anatomical Site')
#labels follow the plotted columns: sites on x, age on y (they were swapped before)
plt.xlabel('Anatomical Site')
plt.ylabel('Age')
plt.show()

In [None]:
#histogram of the 'tbp_lv_location' column, showing how often each location value occurs in the dataset
plt.figure(figsize=(10, 3))
plt.hist(data['tbp_lv_location'], bins=20)
plt.title('Histogram Distribution of Anatomical Site')
#label the axis with the column that is actually plotted (not 'anatom_site_general')
plt.xlabel('tbp_lv_location')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

____________________________________________________________________________________________________________________
We used various plot types (count plot, scatter plot, and histogram) to visualize different aspects of the data, 
which helped us identify patterns in age distribution, gender differences, 
and the distribution of anatomical sites across the dataset.
____________________________________________________________________________________________________________________

## cleaning data (normalization and dropping)

In [None]:
#rescale every numeric feature into the [0,1] range for faster convergence
scaler = MinMaxScaler()
scaler.fit(data[numeric]) #learn per-column min and max
data[numeric] = scaler.transform(data[numeric])
data[numeric].head()

In [None]:
#replace every categorical column with integer codes
#the single encoder is refit for each column, so afterwards it only holds the classes of the last one
encoder = LabelEncoder()
encoded_columns = {name: encoder.fit_transform(data[name]) for name in categorical}
data = data.assign(**encoded_columns)

data[categorical]

In [None]:
#allocate the X and y data

#pick the label by name instead of by position, so the result does not depend on
#'target' happening to be the first column after the earlier drops
y = data['target']
X = data.drop(columns=['target'])

In [None]:
#convert features and labels into float32 tensors for pytorch
feature_matrix = X.to_numpy()
label_vector = y.to_numpy()
X = torch.tensor(feature_matrix, dtype=torch.float32)
y = torch.tensor(label_vector, dtype=torch.float32).unsqueeze(1) #labels become a single column

#  Models

### NN model

In [None]:
#split the data into 80% training and 20% testing datasets (test_size=0.2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 0)

In [None]:
#wrap the tensors in datasets and batch them with DataLoaders for training and testing

train_dataset, test_dataset = (
    TensorDataset(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
loader_options = {"batch_size": 400}
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options) #only the training batches are shuffled
test_loader = DataLoader(test_dataset, **loader_options)

In [None]:
#simple fully connected neural network
class SimpleNN(nn.Module):
    """Small fully connected binary classifier that outputs raw logits.

    Layer sizes are n -> 64 -> 32 -> 1. ReLU follows the first two linear
    layers and dropout (p=0.5) is applied to the second activation; the
    small size and the dropout are meant to limit overfitting.
    """

    def __init__(self, n):
        """Create the layers; ``n`` is the number of input features."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=n, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        """Return one raw logit per input row; no sigmoid is applied here."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        hidden = self.dropout(hidden)
        return self.fc3(hidden)

n = X.shape[-1] #number of features (columns)
model = SimpleNN(n)

#BCEWithLogitsLoss combines the sigmoid with binary cross entropy, which is why the model emits raw logits
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)

In [None]:
#begin training the model

epochs = 20
for epoch in range(epochs):

    #reset the per-epoch loss and accuracy counters
    training_loss = 0
    total = 0
    correct = 0
    model.train() #set the model in train mode (dropout active)

    for X_batch, y_batch in train_loader: #using the batches from loader

        optimizer.zero_grad() #clear gradient from previous step
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward() #compute gradient by backpropagation
        optimizer.step() #update the parameters accordingly
        training_loss += loss.item()

        #the model emits a single logit per sample, so the class is obtained by
        #thresholding the sigmoid at 0.5 (argmax over one column would always be 0);
        #predictions keep the (batch, 1) shape of y_batch so the comparison is elementwise
        predicted = (torch.sigmoid(outputs.detach()) > 0.5).float()
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

    training_loss /= len(train_loader)
    #accuracy is measured on the training batches while dropout is active
    training_accuracy = correct / total if total else 0.0

    print(f"Epoch {epoch+1} -> Training Loss: {training_loss:.6f}")
    print(f"Epoch {epoch+1} -> Training Accuracy: {training_accuracy:.4f}")

In [None]:
#evaluate the trained model on the test batches

model.eval() #evaluation mode
test_loss = 0
y_true = [] #actual labels, one entry per sample
y_pred = [] #predicted labels, one entry per sample

with torch.no_grad(): #no gradients are needed while testing
    for X_batch, y_batch in test_loader:
        logits = model(X_batch)
        test_loss += criterion(logits, y_batch).item() #accumulate the batch losses

        #a sigmoid probability above 0.5 counts as the positive class
        predictions = (torch.sigmoid(logits) > 0.5).float()
        y_true.extend(y_batch.numpy())
        y_pred.extend(predictions.numpy())

#average the loss over the batches and score the collected predictions
test_loss /= len(test_loader)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f'Average test loss: {test_loss:.8f}')
print(f'Accuracy: %{accuracy * 100}')
print(f'F1 score: %{f1 * 100}')

###  Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#initialize the random forest classifier
clf = RandomForestClassifier(
    n_estimators=70,
    max_depth=4,
    random_state=42
)

#X and y are torch tensors at this point and y is a column of shape (N, 1);
#scikit-learn expects array-like features and 1-D labels, so convert and flatten
X_rf = X.numpy() if isinstance(X, torch.Tensor) else np.asarray(X)
y_rf = (y.numpy() if isinstance(y, torch.Tensor) else np.asarray(y)).ravel()
X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)

clf.fit(X_train, y_train)

#predict and evaluate the model
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: %{accuracy * 100}')

### CNN model 

#### pre_processing images and visualization

In [None]:
import cv2
import numpy as np
from PIL import Image, ImageEnhance
import matplotlib.pyplot as plt

#### Subset of the images processed to be used in the model

In [None]:
import cv2
import numpy as np
import os

def load_image(image_path):
    """Read the file at ``image_path`` with ``cv2.imread`` and return the result.

    The caller in this file treats a ``None`` result as an unreadable image.
    """
    image = cv2.imread(image_path)
    return image

def resize_image(image, size=(224, 224)):
    """Return ``image`` resized to ``size`` (default 224x224) for model training later."""
    resized = cv2.resize(image, size)
    return resized

def apply_clahe(image):
    """Enhance contrast by applying CLAHE to the first LAB channel of a BGR image.

    The image is converted BGR -> LAB, CLAHE (clip limit 2.0, 8x8 tiles) is run
    on the first channel only, and the result is converted back to BGR.
    """
    lab_image = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
    lightness, chroma_a, chroma_b = cv2.split(lab_image)
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    merged = cv2.merge((equalizer.apply(lightness), chroma_a, chroma_b))
    return cv2.cvtColor(merged, cv2.COLOR_LAB2BGR)

def remove_hair(image):
    """Remove hair from a BGR image using morphology operations and inpainting.

    A black-hat transform with a 17x17 rectangular kernel is computed on the
    grayscale image, responses above 10 become the mask, and the masked pixels
    are filled in with Telea inpainting (radius 1).
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    structuring_element = cv2.getStructuringElement(cv2.MORPH_RECT, (17, 17))
    dark_details = cv2.morphologyEx(grayscale, cv2.MORPH_BLACKHAT, structuring_element)
    hair_mask = cv2.threshold(dark_details, 10, 255, cv2.THRESH_BINARY)[1]
    return cv2.inpaint(image, hair_mask, 1, cv2.INPAINT_TELEA)

def adjust_gamma(image, gamma=1.3):
    """Adjust brightness with gamma correction via a 256-entry lookup table.

    Each intensity ``i`` maps to ``(i / 255) ** (1 / gamma) * 255``, truncated
    to ``uint8``; the table is applied with ``cv2.LUT``.
    """
    exponent = 1.0 / gamma
    mapped_levels = ((level / 255.0) ** exponent * 255 for level in range(256))
    table = np.fromiter(mapped_levels, dtype=float, count=256).astype("uint8")
    return cv2.LUT(image, table)

def sharpen_image(image):
    """Sharpen the image by blending it with a Gaussian-blurred copy.

    The output is ``1.5 * image - 0.5 * blurred``, where the blur uses
    sigma 55 in both directions.
    """
    image_weight, blur_weight = 1.5, -0.5
    blurred = cv2.GaussianBlur(image, (0, 0), sigmaX=55, sigmaY=55)
    return cv2.addWeighted(image, image_weight, blurred, blur_weight, 0)

def preprocess_image(image_path):
    """Run the full preprocessing pipeline on one image file.

    Steps, in order: resize, CLAHE, hair removal, gamma correction, sharpening.
    Returns ``None`` when the image could not be loaded.
    """
    image = load_image(image_path)
    if image is None:
        return None

    pipeline = (resize_image, apply_clahe, remove_hair, adjust_gamma, sharpen_image)
    for step in pipeline:
        image = step(image)
    return image

def process_image_folder(input_dir, output_dir):
    """Preprocess every file in ``input_dir`` and write the results to ``output_dir``.

    ``output_dir`` is created when missing. Entries that are not regular files
    and files that cannot be loaded as images are skipped. Each result is saved
    under its original file name with JPEG quality 85, and a line is printed
    per file stating whether the write succeeded.
    """
    #exist_ok avoids the separate existence check and the race it implies
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        file_path = os.path.join(input_dir, filename)
        if not os.path.isfile(file_path):
            continue

        processed_image = preprocess_image(file_path)
        if processed_image is None:
            continue

        output_path = os.path.join(output_dir, filename)
        #cv2.imwrite reports failure through its return value, so check it
        #instead of announcing success unconditionally
        written = cv2.imwrite(output_path, processed_image, [cv2.IMWRITE_JPEG_QUALITY, 85])
        if written:
            print(f"Processed and saved: {filename}")
        else:
            print(f"Failed to save: {filename}")

    print(f"All images in {input_dir} have been processed and saved to {output_dir}")

#destination for the preprocessed copies and source folder with the training images
output_folder = r"C:\Users\PC\Documents\isic-2024-challenge\train-image\trained_im_preprocessed"
input_folder = r"C:\Users\PC\Documents\isic-2024-challenge\train-image\image"

process_image_folder(input_dir=input_folder, output_dir=output_folder)

#### Creating a dataset consisting of image titles and their corresponding labels only

In [None]:
#Windows paths are written as raw strings: in a normal string literal "\U" starts a
#unicode escape (a syntax error) and "\t" would turn into a tab character
#NOTE(review): the two reads below point at different locations of train-metadata.csv - confirm both exist
df = pd.read_csv(r"C:\Users\PC\Documents\train-metadata.csv\train-metadata.csv")
df.info(verbose = False, memory_usage = 'deep')

#reload keeping only the image id and its label
req_cols= ['isic_id', 'target']
df= pd.read_csv(r"C:\Users\PC\Documents\isic-2024-challenge\train-metadata.csv", usecols= req_cols)
y= df['target']
x = df.drop(['target'], axis= 1)

#### CNN model that deals with images

In [None]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# Label table: image ids with their targets (adjust the path to your CSV file)
csv_file_path = r'C:\Users\PC\Documents\isic-2024-challenge\labels_sub.csv'
df = pd.read_csv(csv_file_path)

# Folder holding the preprocessed images (adjust the path to your image folder)
images_folder = r"C:\Users\PC\Documents\isic-2024-challenge\train-image\trained_im_preprocessed"

# Training configuration: image side length in pixels, batch size, number of epochs
IMG_SIZE, BATCH_SIZE, EPOCHS = 224, 32, 40

def load_images_and_labels(df, image_folder, img_size=IMG_SIZE):
    """Load every image listed in ``df`` together with its label.

    For each row, ``<isic_id>.jpg`` is read from ``image_folder``, resized to
    ``img_size`` x ``img_size`` and scaled to the range [0, 1]. The label is
    taken from the ``target`` column.

    Returns a tuple ``(images, labels)`` of NumPy arrays in row order.
    """
    images, labels = [], []

    for image_id, label in zip(df['isic_id'], df['target']):
        img_path = os.path.join(image_folder, image_id + '.jpg')

        # Load at the requested size, then normalize pixel values
        pixels = img_to_array(load_img(img_path, target_size=(img_size, img_size)))
        images.append(pixels / 255.0)
        labels.append(label)

    return np.array(images), np.array(labels)

# Read all images and their labels into memory
X, y = load_images_and_labels(df, images_folder)

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the labels to match the two-unit softmax output
y_train, y_test = (to_categorical(labels, num_classes=2) for labels in (y_train, y_test))

# Assemble the CNN: three conv/pool stages followed by a dense classifier head
cnn_layers = [
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, 3)),
    MaxPooling2D(pool_size=(2, 2)),
]
for n_filters in (64, 128):
    cnn_layers.append(Conv2D(n_filters, (3, 3), activation='relu'))
    cnn_layers.append(MaxPooling2D(pool_size=(2, 2)))
cnn_layers += [
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),  # 2 output classes: benign (0) and malignant (1)
]
model = Sequential(cnn_layers)

# Configure optimizer, loss and the reported metric
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Fit on the training split; the test split doubles as validation data
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_test, y_test))

# Score the trained model on the test split
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Persist the trained model
model.save('cnn_skin_lesion_model.h5')


# Comparison 