In [24]:
#!/usr/bin/env python
# coding: utf-8

In [25]:
import pandas as pd
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import time
from tqdm import tqdm
import matplotlib.pyplot as plt


# record start time
_START_RUNTIME = time.time()

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

# Define data and weight path
DATA_PATH = "../data/raw/fitzpatrick17k.csv"

## About Raw Data
https://github.com/mattgroh/fitzpatrick17k/blob/main/fitzpatrick17k.csv

In [26]:
# Analysis of the Fitzpatrick 17k dataset

df = pd.read_csv(DATA_PATH)

print("Columns in the dataset:", df.columns)
print("Total samples:", len(df))
print("\nFitzpatrick scale distribution:")
print(df['fitzpatrick_scale'].value_counts().sort_index())
print("\nNumber of unique conditions:", df['label'].nunique())
print("\nTop 10 conditions:")
print(df['label'].value_counts().head(10))

Columns in the dataset: Index(['md5hash', 'fitzpatrick_scale', 'fitzpatrick_centaur', 'label',
       'nine_partition_label', 'three_partition_label', 'qc', 'url',
       'url_alphanum'],
      dtype='object')
Total samples: 16577

Fitzpatrick scale distribution:
fitzpatrick_scale
-1     565
 1    2947
 2    4808
 3    3308
 4    2781
 5    1533
 6     635
Name: count, dtype: int64

Number of unique conditions: 114

Top 10 conditions:
label
psoriasis                      653
squamous cell carcinoma        581
lichen planus                  491
basal cell carcinoma           468
allergic contact dermatitis    430
lupus erythematosus            410
neutrophilic dermatoses        361
sarcoidosis                    349
photodermatoses                348
folliculitis                   342
Name: count, dtype: int64


In [27]:
def analyze_distribution(df):
    """Analyze distribution of skin types and conditions"""
    
    print("\nSkin Type Distribution:")
    skin_type_dist = df['fitzpatrick_scale'].value_counts().sort_index()
    print(skin_type_dist)
    
    print("\nCondition Distribution:")
    condition_dist = df['three_partition_label'].value_counts()
    print(condition_dist)
    
    # Visualize distributions
    plt.figure(figsize=(12, 5))
    
    plt.subplot(1, 2, 1)
    skin_type_dist.plot(kind='bar')
    plt.title('Distribution of Skin Types')
    plt.xlabel('Fitzpatrick Scale')
    plt.ylabel('Count')
    
    plt.subplot(1, 2, 2)
    condition_dist.plot(kind='bar')
    plt.title('Distribution of Conditions')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()
    
    return skin_type_dist, condition_dist

# Process the data and organize data into right folders

In [28]:
import sys
sys.path.append('../src/data')
from preprocess import download_dataset_images, update_df_with_image_paths, create_data_splits

def data_preprocess(df, base_dir='../data/processed/images'):
    """
    Process the dataset by downloading images and organizing them into train/val/test splits
    
    Args:
        df: DataFrame with the dataset information
        base_dir: Base directory for storing images
        
    Returns:
        train_df, val_df, test_df: Split DataFrames
    """
    # Create base directory if it doesn't exist
    os.makedirs(base_dir, exist_ok=True)
    
    # Step 1: Download images
    print("Step 1: Downloading images...")
    successful_downloads = download_dataset_images(df, base_dir)
    print(f"Successfully downloaded {len(successful_downloads)} images")
    
    # Step 2: Update DataFrame with image paths
    print("\nStep 2: Updating DataFrame with image paths...")
    df_with_images = update_df_with_image_paths(df, base_dir)
    print(f"Total samples with images: {len(df_with_images)}")
    
    # Step 3: Create data splits and organize images
    print("\nStep 3: Creating data splits and organizing images...")
    train_df, val_df, test_df = create_data_splits(df_with_images, base_dir)
    
    # Print summary
    # if train_df is not None:
    #     print("\nData preparation completed successfully!")
    #     print(f"Training set: {len(train_df)} images")
    #     print(f"Validation set: {len(val_df)} images")
    #     print(f"Test set: {len(test_df)} images")
        
    #     # Print distribution of skin types in each split
    #     print("\nFitzpatrick scale distribution:")
    #     print("Training set:")
    #     print(train_df['fitzpatrick_scale'].value_counts().sort_index())
    #     print("\nValidation set:")
    #     print(val_df['fitzpatrick_scale'].value_counts().sort_index())
    #     print("\nTest set:")
    #     print(test_df['fitzpatrick_scale'].value_counts().sort_index())
        
    #     # Print distribution of classifications in each split
    #     if 'three_partition_label' in train_df.columns:
    #         print("\nClassification distribution:")
    #         print("Training set:")
    #         print(train_df['three_partition_label'].value_counts())
    #         print("\nValidation set:")
    #         print(val_df['three_partition_label'].value_counts())
    #         print("\nTest set:")
    #         print(test_df['three_partition_label'].value_counts())
    
    return train_df, val_df, test_df

data_preprocess(df, base_dir='../data/processed/images')

Step 1: Downloading images...
Downloading images...


  1%|          | 147/16577 [00:28<39:24,  6.95it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


  2%|▏         | 351/16577 [01:07<39:19,  6.88it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


  3%|▎         | 537/16577 [01:43<39:43,  6.73it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


  5%|▌         | 870/16577 [02:48<35:41,  7.33it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


  5%|▌         | 881/16577 [02:50<36:57,  7.08it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


  6%|▌         | 920/16577 [02:57<37:17,  7.00it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


  8%|▊         | 1324/16577 [04:24<36:36,  6.95it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 12%|█▏        | 1960/16577 [06:30<34:12,  7.12it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 12%|█▏        | 1972/16577 [06:32<47:40,  5.11it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 13%|█▎        | 2080/16577 [06:52<35:12,  6.86it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 15%|█▌        | 2521/16577 [08:19<32:09,  7.28it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 22%|██▏       | 3603/16577 [11:47<31:36,  6.84it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 22%|██▏       | 3674/16577 [12:01<34:18,  6.27it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 26%|██▌       | 4267/16577 [13:59<41:30,  4.94it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 27%|██▋       | 4428/16577 [14:30<28:56,  7.00it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 29%|██▉       | 4795/16577 [15:40<36:11,  5.43it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 33%|███▎      | 5412/16577 [17:38<27:37,  6.74it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 33%|███▎      | 5434/16577 [17:42<27:07,  6.85it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 35%|███▍      | 5789/16577 [18:51<26:43,  6.73it/s]  

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 37%|███▋      | 6203/16577 [20:11<27:09,  6.37it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 41%|████      | 6716/16577 [22:02<8:38:18,  3.15s/it]

Error downloading https://www.dermaamin.com/site/images/clinical-pic/r/rhus_dermatitis/rhus_dermatitis30.jpg: HTTPSConnectionPool(host='www.dermaamin.com', port=443): Read timed out. (read timeout=10)


 41%|████      | 6717/16577 [22:12<14:18:36,  5.22s/it]

Error downloading https://www.dermaamin.com/site/images/clinical-pic/p/paronychia/paronychia13.jpg: HTTPSConnectionPool(host='www.dermaamin.com', port=443): Read timed out. (read timeout=10)


 44%|████▍     | 7265/16577 [24:09<34:38,  4.48it/s]   

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 44%|████▍     | 7330/16577 [24:21<22:55,  6.72it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 45%|████▍     | 7446/16577 [24:43<21:40,  7.02it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 46%|████▌     | 7620/16577 [25:17<22:10,  6.73it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 47%|████▋     | 7792/16577 [25:51<21:27,  6.82it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 54%|█████▎    | 8889/16577 [29:17<17:47,  7.20it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 55%|█████▌    | 9128/16577 [30:01<17:52,  6.95it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 58%|█████▊    | 9574/16577 [31:26<17:24,  6.70it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 58%|█████▊    | 9686/16577 [31:47<16:39,  6.90it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 60%|█████▉    | 9933/16577 [32:36<15:49,  7.00it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 60%|██████    | 9967/16577 [32:42<15:43,  7.01it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 64%|██████▍   | 10587/16577 [34:41<13:53,  7.18it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 66%|██████▌   | 10914/16577 [35:43<14:19,  6.59it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 66%|██████▋   | 10985/16577 [35:57<13:41,  6.80it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 67%|██████▋   | 11125/16577 [36:23<12:59,  6.99it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 71%|███████   | 11787/16577 [41:14<14:52,  5.37it/s]   

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 72%|███████▏  | 11913/16577 [41:43<11:46,  6.60it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 73%|███████▎  | 12156/16577 [42:29<14:54,  4.94it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 74%|███████▎  | 12194/16577 [42:36<11:01,  6.62it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 75%|███████▍  | 12425/16577 [43:22<14:57,  4.63it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


 76%|███████▌  | 12537/16577 [43:44<10:07,  6.65it/s]

Error downloading nan: Invalid URL 'nan': No scheme supplied. Perhaps you meant https://nan?


100%|██████████| 16577/16577 [58:25<00:00,  4.73it/s]


Successfully downloaded 3887 images

Step 2: Updating DataFrame with image paths...
Total samples with images: 3887

Step 3: Creating data splits and organizing images...
Creating data splits...
Created directory: ../data/processed/images/train
Created directory: ../data/processed/images/val
Created directory: ../data/processed/images/test
Done Creating directories
Unique classification values: ['non-neoplastic' 'benign' 'malignant']

Stratification column value counts:
strat_col
4_non-neoplastic     803
3_non-neoplastic     679
5_non-neoplastic     521
2_non-neoplastic     418
3_malignant          196
6_non-neoplastic     179
2_malignant          166
4_malignant          158
4_benign             121
3_benign             121
-1_non-neoplastic    113
1_non-neoplastic      91
5_malignant           76
2_benign              74
1_malignant           61
5_benign              39
-1_malignant          28
6_malignant           17
-1_benign             12
1_benign               9
6_benign       

100%|██████████| 2720/2720 [00:01<00:00, 1447.02it/s]


TESTING IF IMAGES ARE MOVED

Organizing images in val folder...
Total images to organize: 583


100%|██████████| 583/583 [00:00<00:00, 1313.54it/s]


TESTING IF IMAGES ARE MOVED

Organizing images in test folder...
Total images to organize: 584


100%|██████████| 584/584 [00:00<00:00, 1466.65it/s]


TESTING IF IMAGES ARE MOVED
Saved train_split.csv with 2720 records
Saved val_split.csv with 583 records
Saved test_split.csv with 584 records

Final directory structure created:
  train/malignant: 6 conditions
  train/benign: 16 conditions
  train/non-neoplastic: 66 conditions
  val/malignant: 6 conditions
  val/benign: 13 conditions
  val/non-neoplastic: 60 conditions
  test/malignant: 6 conditions
  test/benign: 14 conditions
  test/non-neoplastic: 57 conditions


(                                md5hash  fitzpatrick_scale  \
 14878  5b1fd32b7316582bae01110f2282237c                  3   
 14283  89d26d163983669c7e2e5babbeb97bfe                  3   
 13692  156ed125eab624182be3e6304cd5d1ed                  2   
 13363  5a55b6a5368fee12881efc2bfb3ddb29                  3   
 16551  7cd53e07049181c7ba86eb59636e9e42                  3   
 ...                                 ...                ...   
 16295  9095b0fe47a5bca471c27e3b12967d00                  5   
 14508  6096acdcd60616120d5b7f6637892634                  5   
 15952  37d5efb3f304b6bd0472dde58bd78b5a                  4   
 16082  4c731419c06a0d56f81cb03929783d38                 -1   
 16373  d2e55a1959fdb0563226fbada4e405b3                  5   
 
        fitzpatrick_centaur                        label nine_partition_label  \
 14878                    4        necrobiosis lipoidica         inflammatory   
 14283                    4                fordyce spots        benign dermal   

In [29]:
# Add this cell to your notebook to manually clean up the base directory
def manual_cleanup(base_dir='../data/processed/images', train_df=None, val_df=None, test_df=None):
    """
    Manually clean up the base directory by removing images that have been copied to split folders
    """
    print("\nManually cleaning up base directory...")
    
    # Get all image files in the base directory
    base_files = [f for f in os.listdir(base_dir) if f.endswith('.jpg') and os.path.isfile(os.path.join(base_dir, f))]
    print(f"Found {len(base_files)} jpg files in base directory")
    
    # Count files removed
    removed_count = 0
    
    # Remove each file
    for filename in base_files:
        file_path = os.path.join(base_dir, filename)
        try:
            os.remove(file_path)
            removed_count += 1
        except Exception as e:
            print(f"Error removing {filename}: {str(e)}")
    
    print(f"Removed {removed_count} original images from base directory")

# Call the manual cleanup function
manual_cleanup()



Manually cleaning up base directory...
Found 3887 jpg files in base directory
Removed 3887 original images from base directory
