In [2]:
import pandas as pd
from PIL import Image
import os
from tqdm import tqdm
from sklearn.metrics import roc_curve, auc, roc_auc_score
import torch
import numpy as np
from torchmetrics.classification import BinaryAUROC
from classification import *
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from libauc.losses import pAUCLoss
from libauc.sampler import DualSampler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
fold = 3
df_data = pd.read_csv("/home/nhattm/ISIC2024/dataset/data_images.csv")
df_train = df_data[df_data["fold"] != fold].reset_index(drop=True)
df_test = df_data[df_data["fold"] == fold].reset_index(drop=True)
transforms = get_transform()

In [3]:
test_dataset = ISIC_Loader(df_test)[11095]

In [4]:
# image, label = test_dataset
# image_aug = transforms(image)
# plt.imshow(image_aug[0].permute(1, 2, 0))

In [5]:
df_data["target"].value_counts()

target
0    50000
1     5499
Name: count, dtype: int64

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fold =1
df_data = pd.read_csv("./dataset/data_images.csv")
# get dataframe train and test
df_train = df_data[df_data["fold"] != fold].reset_index(drop=True)
df_test = df_data[df_data["fold"] == fold].reset_index(drop=True)
train_loader = ISIC_Loader(df_train)
test_loader = ISIC_Loader(df_test)
# Define data loaders for the training and test data


test_dataset = DataLoader(
    test_loader,
    batch_size=cfg.TRAIN.BATCH_SIZE,
    num_workers=cfg.TRAIN.NUM_WORKERS,
    prefetch_factor=cfg.TRAIN.PREFETCH_FACTOR,
        )
model = convnext_small(
    pretrained=cfg.TRAIN.PRETRAIN,
    in_22k=cfg.TRAIN.CONVEXT.IN22K,
    in_chans=cfg.DATA.IN_CHANNEL,
    num_classes=cfg.DATA.NUM_CLASS,
    drop_path_rate=cfg.TRAIN.CONVEXT.DROPOUT,
)
classifier = Classifier(
    model,
    cfg.DATA.CLASS_WEIGHT,
    cfg.DATA.NUM_CLASS,
    cfg.OPT.LEARNING_RATE,
    cfg.OPT.FACTOR_LR,
    cfg.OPT.PATIENCE_LR,
)

In [21]:
sampler = DualSampler(train_loader, batch_size=cfg.TRAIN.BATCH_SIZE, sampling_rate=0.1)
train_dataset = DataLoader(
    train_loader,
    batch_size=cfg.TRAIN.BATCH_SIZE,
    pin_memory=True,
    shuffle=False,
    num_workers=cfg.TRAIN.NUM_WORKERS,
    drop_last=True,
    prefetch_factor=cfg.TRAIN.PREFETCH_FACTOR,
    sampler=sampler,
)

In [18]:
df_data = pd.read_csv("./dataset/data_images.csv")
# get dataframe train and test
df_train = df_data[df_data["fold"] != fold].reset_index(drop=True)
df_test = df_data[df_data["fold"] == fold].reset_index(drop=True)
# duplicate df_Train to df_train_aug that all columns have target==1 will be duplicated 10 times
df_train_aug = df_train[df_train["target"] == 1].copy()
df_train_aug = pd.concat([df_train_aug] * 8, ignore_index=True)
df_train = pd.concat([df_train, df_train_aug], ignore_index=True)

In [32]:
import pandas as pd
import os
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold

# add path to this file
import sys

# sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# create a df for 3 datasets but get the positive samples from 2019 and 2020 only
# load the data
df_train_2019a = pd.read_csv("./dataset/data2019/ISIC_2019_Training_Metadata.csv")
df_train_2019b = pd.read_csv("./dataset/data2019/train-groundtruth.csv")
# concatenate the two dataframes, cat 2019a with 2019b
df_train_2019 = pd.concat([df_train_2019a, df_train_2019b], axis=1)
# remove columns :lesion_id,Unnamed: 0
df_train_2019 = df_train_2019.drop(columns=["Unnamed: 0", "isic_id", "lesion_id"], axis=1)
df_train_2019 = df_train_2019.rename(columns={"image": "isic_id"})

# get the positive samples from 2019
df_train_2019_positives = df_train_2019[df_train_2019["target"] == 1].copy()
# loop through the dataframe and add the address of the image to the dataframe
for isic_id in tqdm(df_train_2019_positives["isic_id"]):
    image_path1 = f"./dataset/data2019/image/{isic_id}.jpg"
    image_path2 = f"./dataset/data2019/image/{isic_id}_downsampled.jpg"

    if os.path.exists(image_path1):
        df_train_2019_positives.loc[df_train_2019_positives["isic_id"] == isic_id, "image_path"] = image_path1
    elif os.path.exists(image_path2):
        df_train_2019_positives.loc[df_train_2019_positives["isic_id"] == isic_id, "image_path"] = image_path2
    else:
        print(f"Image {isic_id} not found")
        break

df_train_2020 = pd.read_csv("./dataset/data2020/train.csv")
# drop the diagnosis, benign_malignant columns
df_train_2020 = df_train_2020.drop(columns=["diagnosis", "benign_malignant"], axis=1)
# change name of the columns 2020 to the same 2019:
# image_name -> image, anatom_site_general_challenge -> anatom_site_general
df_train_2020 = df_train_2020.rename(
    columns={"image_name": "isic_id", "anatom_site_general_challenge": "anatom_site_general"}
)

# get the positive samples from 2020
df_train_2020_positives = df_train_2020[df_train_2020["target"] == 1].copy()

for isic_id in tqdm(df_train_2020_positives["isic_id"]):
    image_path = f"./dataset/data2020/image/{isic_id}.jpg"

    if os.path.exists(image_path):
        df_train_2020_positives.loc[df_train_2020_positives["isic_id"] == isic_id, "image_path"] = image_path
    else:
        print(f"Image {isic_id} not found")
        break

# load data 2024
df_train_2024 = pd.read_csv("./dataset/data2024/train-metadata.csv")
df_test_2024 = pd.read_csv("./dataset/data2024/test-metadata.csv")
# remove columns in df 2024 train if it not in df 2024 test
remove_columns = [col for col in df_train_2024.columns if col not in df_test_2024.columns]
remove_columns.remove("target")
df_train_2024 = df_train_2024.drop(columns=remove_columns, axis=1)
# get all images with target == 1 and 50000 images with target == 0
# to balance the dataset
df_train_2024 = df_train_2024.sort_values(by="target", ascending=False)

df_train_2024_positives = df_train_2024[df_train_2024["target"] == 1].copy()
for isic_id in tqdm(df_train_2024_positives["isic_id"]):
    image_path = f"./dataset/data2024/image/{isic_id}.jpg"

    if os.path.exists(image_path):
        df_train_2024_positives.loc[df_train_2024_positives["isic_id"] == isic_id, "image_path"] = image_path
    else:
        print(f"Image {isic_id} not found")
        break

df_train_positives = pd.concat([df_train_2019_positives, df_train_2020_positives, df_train_2024_positives], axis=0)
df_train_positives = df_train_positives.reset_index(drop=True)
# spilt df_train_positives into 5 folds, each fold has the same number of positive samples
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
df_train_positives = df_train_positives[["isic_id", "target", "image_path"]]
df_train_positives["fold"] = -1
for fold, (train_index, val_index) in enumerate(skf.split(df_train_positives, df_train_positives["target"])):
    df_train_positives.loc[val_index, "fold"] = fold + 1

# get the negative samples from 2024
df_train_2024 = df_train_2024[["isic_id", "target"]]
df_train_2024_negatives = df_train_2024[df_train_2024["target"] == 0].copy()
# for training each fold, we use 44000 negative samples for train and 5500 negative samples for validation from df_train_2024_negatives
for fold in range(1, 6):
    df_negatives = df_train_2024_negatives[55000 * (fold - 1) : 55000 * fold].copy().reset_index(drop=True)
    # get 11000 negative samples for validation so change the "fold" column to -1
    df_negatives.loc[44000:, "fold"] = fold
    df_negatives.loc[:44000, "fold"] = -1
    for isic_id in tqdm(df_negatives["isic_id"]):
        image_path = f"./dataset/data2024/image/{isic_id}.jpg"

        if os.path.exists(image_path):
            df_negatives.loc[df_negatives["isic_id"] == isic_id, "image_path"] = image_path
        else:
            print(f"Image {isic_id} not found")
            break
    # concatenate the negative samples with the positive samples
    df_train_fold = pd.concat([df_negatives, df_train_positives], axis=0).reset_index(drop=True)
    # save the dataframe to csv file
    df_train_fold.to_csv(f"./dataset/data_images_fold{fold}.csv", index=False)

  0%|          | 0/4522 [00:00<?, ?it/s]

100%|██████████| 4522/4522 [00:01<00:00, 4395.16it/s]
100%|██████████| 584/584 [00:00<00:00, 8122.03it/s]
  df_train_2024 = pd.read_csv("./dataset/data2024/train-metadata.csv")
100%|██████████| 393/393 [00:00<00:00, 8652.95it/s]
100%|██████████| 55000/55000 [01:23<00:00, 656.13it/s]
100%|██████████| 55000/55000 [01:23<00:00, 658.24it/s]
100%|██████████| 55000/55000 [01:23<00:00, 659.63it/s]
100%|██████████| 55000/55000 [01:23<00:00, 658.99it/s]
100%|██████████| 55000/55000 [01:23<00:00, 656.90it/s]


In [8]:
df_train_fold = pd.read_csv("./dataset/data_images_fold2.csv")

In [9]:
df_train_fold.target.value_counts()

target
0    55000
1     5499
Name: count, dtype: int64

In [10]:
df_train_positives["target"].value_counts()

NameError: name 'df_train_positives' is not defined