# Ensemble Model Using DenseNet, EfficientNet, ResNet50, XGBoost, LightGBM, and CatBoost

## Dependencies

In [None]:
import numpy as np
import pandas as pd
import h5py
import torch
import cv2
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import os
from torch.utils.data import Dataset
from torchvision import transforms, models
from torchvision.models import densenet121, DenseNet121_Weights
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
# from dataset import HDF5Dataset
from sklearn.metrics import roc_auc_score, roc_curve
from isic_metric import score

## Data Loading

In [None]:
# Accumulators shared by all loading cells: one entry per sample, kept index-aligned.
images, labels = [], []

### First, load the malignant data from the original database

In [None]:
# Original training data: a CSV of per-image metadata and an HDF5 file of encoded images
# keyed by `isic_id`.
original_train_metadata_path = 'train-metadata.csv'
original_train_hdf5_path = 'train-image.hdf5'

# low_memory=False makes pandas read the file in one pass so column dtypes are inferred consistently.
original_train_metadata = pd.read_csv(
    original_train_metadata_path,
    low_memory=False,
)
# Opened read-only and kept open; later cells read images from this handle.
original_train_hdf5 = h5py.File(original_train_hdf5_path, mode='r')

In [None]:
# Load every malignant image (target != 0) from the original training set.
#
# The malignant ids are selected once with a boolean mask instead of calling `.iloc[i]` on
# every metadata row: `.iloc[i]` builds a whole pandas Series per row, which dominates the
# runtime when almost all rows are skipped.
original_malignant_ids = original_train_metadata.loc[
    original_train_metadata['target'] != 0, 'isic_id'
]

for image_id in tqdm(original_malignant_ids, total=len(original_malignant_ids)):
    # Each HDF5 entry holds an encoded image as raw bytes; decode it to a pixel array.
    encoded = np.frombuffer(original_train_hdf5[image_id][()], dtype=np.uint8)
    # NOTE(review): OpenCV decodes to BGR channel order; confirm the downstream
    # dataset/transform expects BGR rather than RGB.
    image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    image = cv2.resize(image, (128, 128))
    image = image / 255  # scale 8-bit pixel values to [0, 1]

    images.append(image)
    labels.append(1)

# The file is deliberately left open: the benign images are read from it in a later cell.
# original_train_hdf5.close()

### Second, load the augmented malignant images

In [None]:
# Pre-generated augmentations of malignant images: a metadata CSV plus an HDF5 file of
# encoded images.
augmented_malignant_metadata_path = 'augmented_metadata.csv'
augmented_malignant_hdf5_path = 'augmented_data.hdf5'

# Number of augmented variants stored per source image (keys are suffixed `_aug0`, `_aug1`, ...).
n_augmentations = 5

augmented_malignant_metadata = pd.read_csv(
    augmented_malignant_metadata_path,
    low_memory=False,
)
augmented_malignant_hdf5 = h5py.File(augmented_malignant_hdf5_path, mode='r')

In [None]:
# Load all augmented malignant images: `n_augmentations` variants per metadata row, stored
# under the key "<isic_id>_aug<j>".
#
# Iterate the `isic_id` column directly instead of calling `.iloc[i]` inside the inner loop,
# which rebuilt a full row Series for every augmentation.
augmented_base_ids = augmented_malignant_metadata['isic_id']

for base_id in tqdm(augmented_base_ids, total=len(augmented_base_ids)):
    for j in range(n_augmentations):
        image_id = f"{base_id}_aug{j}"
        encoded = np.frombuffer(augmented_malignant_hdf5[image_id][()], dtype=np.uint8)
        image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        # Debug aid: plt.imshow(image); plt.axis('off'); plt.show()
        image = cv2.resize(image, (128, 128))
        image = image / 255  # scale 8-bit pixel values to [0, 1]

        images.append(image)
        labels.append(1)

# This file is not read again below, so release the handle.
augmented_malignant_hdf5.close()

### Third, load the ISIC full database's malignant examples

In [None]:
# Full ISIC archive export: a metadata CSV and an HDF5 file of encoded images.
isic_metadata_path = 'isic_metadata.csv'
isic_hdf5_path = 'isic_image.hdf5'

isic_metadata = pd.read_csv(
    isic_metadata_path,
    low_memory=False,
)
isic_hdf5 = h5py.File(isic_hdf5_path, mode='r')

In [None]:
# Load the malignant images from the full ISIC database.
#
# Only rows whose `benign_malignant` column equals 'malignant' are kept. They are selected
# once with a boolean mask rather than testing `.iloc[i]` row by row, which is very slow on
# a large metadata table.
isic_malignant_ids = isic_metadata.loc[
    isic_metadata['benign_malignant'] == 'malignant', 'isic_id'
]

malignant_count = 0  # number of ISIC malignant images loaded by this cell
for image_id in tqdm(isic_malignant_ids, total=len(isic_malignant_ids)):
    encoded = np.frombuffer(isic_hdf5[image_id][()], dtype=np.uint8)
    image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    image = cv2.resize(image, (128, 128))
    image = image / 255  # scale 8-bit pixel values to [0, 1]
    malignant_count += 1

    images.append(image)
    labels.append(1)
print(f"malignant count: {malignant_count}")
isic_hdf5.close()

In [None]:
# Replace the ISIC-only count with the total number of samples collected so far.
# NOTE(review): this equals the malignant total only if the loading cells above ran once,
# in order, so that `labels` contains nothing but 1s at this point.
malignant_count = len(labels)
print(f"malignant count: {malignant_count}")

### Lastly, load the same number of benign images from the original database

In [None]:
# Balance the classes: load the first `malignant_count` benign images (target == 0) from
# the original training set, in metadata order.
#
# The ids are selected up front and capped with `.head()`. The previous unbounded `while`
# loop indexed past the end of the metadata (IndexError) whenever fewer benign rows than
# `malignant_count` were available, and it paid for a `.iloc[idx]` row lookup on every step.
benign_ids = original_train_metadata.loc[
    original_train_metadata['target'] == 0, 'isic_id'
].head(malignant_count)

benign_loaded = 0
# The context manager closes the progress bar even if decoding an image fails.
with tqdm(total=malignant_count, desc="Loading benign images") as pbar:
    for image_id in benign_ids:
        encoded = np.frombuffer(original_train_hdf5[image_id][()], dtype=np.uint8)
        image = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
        image = cv2.resize(image, (128, 128))
        image = image / 255  # scale 8-bit pixel values to [0, 1]
        images.append(image)
        labels.append(0)
        benign_loaded += 1
        pbar.update(1)

# This is the last read from the original training file, so release the handle.
original_train_hdf5.close()
if benign_loaded < malignant_count:
    # Report the shortfall instead of crashing; the dataset is then imbalanced.
    print(f"warning: only {benign_loaded} benign images available, expected {malignant_count}")
print(f"benign count: {benign_loaded}")

In [None]:
# Sanity check: total number of images collected so far (malignant and benign together).
print(len(images))

### Generate Dataset

In [None]:
from dataset import HDF5Dataset
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for validation. Stratifying on the labels keeps the class
# ratio the same in both splits; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


## Load CNN models

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

### DenseNet

In [None]:
from torchvision.models import densenet121, DenseNet121_Weights
from ModelTrainer import Trainer

# Training hyperparameters passed to the trainer below.
lr = 1e-5
num_epochs = 20

# Use the default pretrained DenseNet-121 weights together with their bundled
# preprocessing transform.
densenet_weights = DenseNet121_Weights.DEFAULT
densenet_transform = densenet_weights.transforms()

# Augmentation is enabled for the training split only.
densenet_train_dataset = HDF5Dataset(
    X_train,
    y_train,
    augment=True,
    transform=densenet_transform,
)
densenet_val_dataset = HDF5Dataset(
    X_val,
    y_val,
    augment=False,
    transform=densenet_transform,
)

densenet_model = densenet121(weights=densenet_weights)

# NOTE(review): Trainer comes from the project-local ModelTrainer module; the arguments
# are passed positionally, so their order must match its constructor.
dense_net_trainer = Trainer(
    device,
    densenet_train_dataset,
    densenet_val_dataset,
    "DenseNet121",
    densenet_weights,
    densenet_transform,
    densenet_model,
    lr,
    num_epochs,
)
dense_net_trainer.train()