# Inspect Bucket

This Jupyter notebook inspects the contents of the storage bucket in place, so the files don't have to be downloaded first.

In [1]:
from google.cloud import storage
import io
from PIL import Image
import pandas as pd
from io import StringIO, BytesIO
import requests
import re
import matplotlib.pyplot as plt
import concurrent.futures
from tqdm import tqdm
from urllib.parse import urlparse
import random
import os

# Never truncate DataFrame previews: show every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Handle to the bucket that every later cell reads from.
client = storage.Client()
bucket = client.bucket('derma-datasets-2')

def view_image(path, bucket):
    """Fetch an image object from the bucket and display it inline.

    Args:
        path: Object name of the image inside ``bucket``.
        bucket: Bucket handle; ``bucket.blob(path)`` must return an object
            that supports ``download_as_bytes()``.

    Returns:
        None. The image is drawn as a matplotlib figure in the cell output.

    The picture is rendered with matplotlib instead of ``Image.show()``:
    ``show()`` hands the image to an external viewer program on the machine
    running the kernel, so nothing appears in the notebook itself.
    """
    img_bytes = bucket.blob(path).download_as_bytes()
    # Decode inside the context manager so the underlying buffer is released
    # afterwards; convert to RGB so palette/grayscale images are not drawn
    # through matplotlib's default colormap.
    with Image.open(BytesIO(img_bytes)) as img:
        rgb_img = img.convert("RGB")

    fig, ax = plt.subplots()
    ax.imshow(rgb_img)
    ax.set_title(path)
    ax.axis("off")
    plt.show()

## Inspect Images

In [2]:
# Sample a few images from the final folder and report their pixel
# dimensions, to decide how (or whether) images should be standardized.
FINAL_PATH = 'final'
FINAL_IMG_PATH = 'final/imgs'
RAW_PATH = 'raw/fitzpatrick17k/'
blobs = bucket.list_blobs(prefix=FINAL_IMG_PATH)

IMAGE_SUFFIXES = ('.jpg', '.jpeg', '.png', '.bmp', '.gif', '.webp', '.tiff')
# Lazily skip every object whose name does not look like an image file.
image_only = (b for b in blobs if b.name.lower().endswith(IMAGE_SUFFIXES))

# Pairing with range(10) stops the loop after the first ten images.
for sample_idx, blob in zip(range(10), image_only):
    # Open with Pillow just long enough to read the dimensions.
    with Image.open(BytesIO(blob.download_as_bytes())) as img:
        width, height = img.size
    print(f"{blob.name}: {width}x{height}")

final/imgs/SCIN_-1001492676369731180.png: 810x1080
final/imgs/SCIN_-1003800477193786941.png: 810x845
final/imgs/SCIN_-1005922060850163675.png: 811x1080
final/imgs/SCIN_-1009086705875443190.png: 978x565
final/imgs/SCIN_-1009810686742517529.png: 805x275
final/imgs/SCIN_-1010272928122958162.png: 810x1080
final/imgs/SCIN_-1010754336982699838.png: 404x977
final/imgs/SCIN_-101168089788221310.png: 139x234
final/imgs/SCIN_-1013329915718011639.png: 810x1080
final/imgs/SCIN_-1013370714954869987.png: 1080x810


The images in the final folder come in many different sizes, so there is no standard size to conform to. I will therefore copy the Fitzpatrick images there as they are, without resizing them.

## Metadata CSV

In [3]:
# Read the existing combined metadata table straight from the bucket.
# The object name is joined with an explicit '/' (the separator used by the
# object names in this bucket) rather than os.path.join, which would insert
# the local OS separator (a backslash on Windows) and miss the object.
meta_csv_blob = bucket.blob(f"{FINAL_PATH}/metadata.csv")
meta_csv_data = meta_csv_blob.download_as_text()
meta_csv = pd.read_csv(StringIO(meta_csv_data))

In [4]:
# Preview the first five rows of the existing metadata.
meta_csv.head(n=5)

Unnamed: 0.1,Unnamed: 0,image_id,unharmonized_label,dataset,confidence,save_name
0,0,-3205742176803893704,"Inflicted skin lesions, Eczema",SCIN,4,SCIN_-3205742176803893704
1,1,-4762289084741430925,"Prurigo nodularis, SCC/SCCIS",SCIN,4,SCIN_-4762289084741430925
2,2,-4027806997035329030,Impetigo,SCIN,2,SCIN_-4027806997035329030
3,3,-3799298995660217860,"Lichen planus/lichenoid eruption, Folliculitis...",SCIN,1,SCIN_-3799298995660217860
4,4,-5881426422999442186,"Lichen planus/lichenoid eruption, Folliculitis...",SCIN,1,SCIN_-5881426422999442186


In [5]:
# Surface every row whose (image_id, dataset) key occurs more than once in the
# existing metadata; keep=False flags all members of each duplicate group.
is_repeated_key = meta_csv.duplicated(subset=["image_id", "dataset"], keep=False)
meta_csv.loc[is_repeated_key].sort_values("image_id")

Unnamed: 0.1,Unnamed: 0,image_id,unharmonized_label,dataset,confidence,save_name
289,289,-9111307368692396870,Acne,SCIN,3,SCIN_-9111307368692396870
3775,3775,-9111307368692396870,Acne,SCIN,4,SCIN_-9111307368692396870
3194,3194,-9111307368692396870,Acne,SCIN,5,SCIN_-9111307368692396870
3579,3579,-8707418133635013822,Allergic Contact Dermatitis,SCIN,3,SCIN_-8707418133635013822
5626,5626,-8707418133635013822,Allergic Contact Dermatitis,SCIN,4,SCIN_-8707418133635013822
5105,5105,-8351498747234518768,Perioral Dermatitis,SCIN,3,SCIN_-8351498747234518768
6190,6190,-8351498747234518768,Eczema,SCIN,4,SCIN_-8351498747234518768
2956,2956,-737002406762138892,Herpes Simplex,SCIN,4,SCIN_-737002406762138892
1172,1172,-737002406762138892,Herpes Simplex,SCIN,3,SCIN_-737002406762138892
1938,1938,-25353211408586438,Folliculitis,SCIN,4,SCIN_-25353211408586438


In [6]:
# Read the SCIN text metadata table straight from the bucket.
# The object name is joined with an explicit '/' (the separator used by the
# object names in this bucket) rather than os.path.join, which would insert
# the local OS separator (a backslash on Windows) and miss the object.
scin_meta_csv_blob = bucket.blob(f"{FINAL_PATH}/SCIN_text_metadata.csv")
scin_meta_csv_data = scin_meta_csv_blob.download_as_text()
scin_meta_csv = pd.read_csv(StringIO(scin_meta_csv_data))

In [7]:
# Preview the first five rows of the SCIN text metadata.
scin_meta_csv.head(n=5)

Unnamed: 0.1,Unnamed: 0,dermatologist_gradable_for_skin_condition_1,dermatologist_gradable_for_skin_condition_2,dermatologist_gradable_for_skin_condition_3,dermatologist_skin_condition_on_label_name,dermatologist_skin_condition_confidence,weighted_skin_condition_label,dermatologist_gradable_for_fitzpatrick_skin_type_1,dermatologist_gradable_for_fitzpatrick_skin_type_2,dermatologist_gradable_for_fitzpatrick_skin_type_3,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,gradable_for_monk_skin_tone_india,gradable_for_monk_skin_tone_us,monk_skin_tone_label_india,monk_skin_tone_label_us,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,race_ethnicity_american_indian_or_alaska_native,race_ethnicity_asian,race_ethnicity_black_or_african_american,race_ethnicity_hispanic_latino_or_spanish_origin,race_ethnicity_middle_eastern_or_north_african,race_ethnicity_native_hawaiian_or_pacific_islander,race_ethnicity_white,race_ethnicity_other_race,race_ethnicity_prefer_not_to_answer,textures_raised_or_bumpy,textures_flat,textures_rough_or_flaky,textures_fluid_filled,body_parts_head_or_neck,body_parts_arm,body_parts_palm,body_parts_back_of_hand,body_parts_torso_front,body_parts_torso_back,body_parts_genitalia_or_groin,body_parts_buttocks,body_parts_leg,body_parts_foot_top_or_side,body_parts_foot_sole,body_parts_other,condition_symptoms_bothersome_appearance,condition_symptoms_bleeding,condition_symptoms_increasing_size,condition_symptoms_darkening,condition_symptoms_itching,condition_symptoms_burning,condition_symptoms_pain,condition_symptoms_no_relevant_experience,other_symptoms_fever,other_symptoms_chills,other_symptoms_fatigue,other_symptoms_joint_pain,other_symptoms_mouth_sores,other_symptoms_shortness_of_breath,other_symptoms_no_relevant_symptoms,related_category,condition_duration,image_1_path,image_2_path,image_3_path,image_1_shot_type,image_2_shot_type,image_3_shot_type,combined_race
,race_ethnicity_two_or_more_after_mitigation,labels,dataset,save_name,confidence,image_id,unharmonized_label
0,0,DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT,,,"['Inflicted skin lesions', 'Eczema', 'Irritant...","[4, 4, 3]","{'Inflicted skin lesions': 0.41, 'Eczema': 0.4...",YES,,,FST2,,,True,True,2.0,1.0,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,,,,,,,YES,,,,,,,,,,,,,YES,,,,YES,YES,,YES,YES,YES,,,,,,,,YES,RASH,ONE_DAY,dataset/images/-3205742176803893704.png,,,CLOSE_UP,,,,,"Inflicted skin lesions, Eczema",SCIN,SCIN_-3205742176803893704,4,-3205742176803893704,"Inflicted skin lesions, Eczema"
1,1,DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT,,,"['Prurigo nodularis', 'Actinic Keratosis', 'SC...","[4, 3, 4]","{'Prurigo nodularis': 0.41, 'SCC/SCCIS': 0.41,...",YES,,,FST1,,,True,True,3.0,3.0,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,dataset/images/-4762289084741430925.png,,,CLOSE_UP,,,,,"Prurigo nodularis, SCC/SCCIS",SCIN,SCIN_-4762289084741430925,4,-4762289084741430925,"Prurigo nodularis, SCC/SCCIS"
2,2,DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT,,,"['Impetigo', 'Herpes Zoster', 'Bullous dermati...","[2, 1, 1]","{'Impetigo': 0.55, 'Herpes Zoster': 0.23, 'Bul...",YES,,,FST4,,,True,True,3.0,4.0,SCIN,1.0.0,2023,AGE_18_TO_29,MALE,NONE_IDENTIFIED,,,,YES,,,,,,,YES,YES,,,YES,,,,,,,,,,,,,YES,,YES,YES,YES,,,,,,,,YES,OTHER_ISSUE_DESCRIPTION,ONE_TO_FOUR_WEEKS,dataset/images/-4027806997035329030.png,,,CLOSE_UP,,,HISPANIC_LATINO_OR_SPANISH_ORIGIN,,Impetigo,SCIN,SCIN_-4027806997035329030,2,-4027806997035329030,Impetigo
3,3,DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT,,,"['Lichen planus/lichenoid eruption', 'Follicul...","[1, 1, 1]","{'Lichen planus/lichenoid eruption': 0.33, 'Fo...",YES,,,FST1,,,True,True,1.0,1.0,SCIN,1.0.0,2023,AGE_40_TO_49,FEMALE,FST3,,,,,,,YES,,,,YES,,,YES,YES,,YES,YES,YES,YES,YES,YES,YES,,,,,,,YES,,,,,,,,,,YES,RASH,ONE_DAY,dataset/images/-3799298995660217860.png,dataset/images/-5881426422999442186.png,dataset/images/5854025080806696361.png,AT_AN_ANGLE,AT_DISTANCE,CLOSE_UP,WHITE,,"Lichen planus/lichenoid eruption, Folliculitis...",SCIN,SCIN_-3799298995660217860,1,-3799298995660217860,"Lichen planus/lichenoid eruption, Folliculitis..."
4,4,DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT,,,"['Lichen planus/lichenoid eruption', 'Follicul...","[1, 1, 1]","{'Lichen planus/lichenoid eruption': 0.33, 'Fo...",YES,,,FST1,,,True,True,1.0,1.0,SCIN,1.0.0,2023,AGE_40_TO_49,FEMALE,FST3,,,,,,,YES,,,,YES,,,YES,YES,,YES,YES,YES,YES,YES,YES,YES,,,,,,,YES,,,,,,,,,,YES,RASH,ONE_DAY,dataset/images/-3799298995660217860.png,dataset/images/-5881426422999442186.png,dataset/images/5854025080806696361.png,AT_AN_ANGLE,AT_DISTANCE,CLOSE_UP,WHITE,,"Lichen planus/lichenoid eruption, Folliculitis...",SCIN,SCIN_-5881426422999442186,1,-5881426422999442186,"Lichen planus/lichenoid eruption, Folliculitis..."


## Inspect Fitzpatrick CSV

In [8]:
# Read the raw Fitzpatrick17k metadata table straight from the bucket.
# The object name is joined with an explicit '/' rather than os.path.join,
# which would insert the local OS separator (a backslash on Windows).
# RAW_PATH may carry a trailing '/', so it is stripped before joining to
# avoid a doubled separator.
fitz_meta_blob = bucket.blob(f"{RAW_PATH.rstrip('/')}/fitzpatrick17k.csv")
fitz_meta_data = fitz_meta_blob.download_as_text()
fitz_meta_csv = pd.read_csv(StringIO(fitz_meta_data))

In [9]:
# Preview the first five rows of the raw Fitzpatrick17k metadata.
fitz_meta_csv.head(n=5)

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum
0,5e82a45bc5d78bd24ae9202d194423f8,3,3,drug induced pigmentary changes,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmminoc...
1,fa2911a9b13b6f8af79cb700937cc14f,1,1,photodermatoses,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicpphoto...
2,d2bac3c9e4499032ca8e9b07c7d3bc40,2,3,dermatofibroma,benign dermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicdderma...
3,0a94359e7eaacd7178e06b2823777789,1,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...
4,a39ec3b1f22c08a421fa20535e037bba,1,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...


## Convert Fitzpatrick CSV to Format

Convert the Fitzpatrick metadata CSV to the format of the existing metadata file, then merge the two.

In [10]:
# filter raw csv
# One prefix value is used both to list the image objects and to strip the
# folder part from their names, so the two can never drift apart. The trailing
# '/' restricts the listing to objects inside the images folder itself.
fitz_img_prefix = "raw/fitzpatrick17k/images/"
image_blobs = bucket.list_blobs(prefix=fitz_img_prefix)
image_filenames = [blob.name for blob in image_blobs]

# get image hashes we have
# (assumes the files are named <md5hash>.<extension> -- TODO confirm)
# dtype="object" keeps the .str accessor usable even if the listing is empty.
image_filenames_ser = pd.Series(image_filenames, dtype="object")
image_hashes = (
    image_filenames_ser
    # literal match: the folder prefix is plain text, not a regex
    .str.replace(fitz_img_prefix, "", regex=False)
    # drop the final ".ext" suffix, leaving the bare file stem
    .str.replace(r'\.[^/.]+$', '', regex=True)
)

# filter metadata to only include images we have
fitz_csv_filt_tmp = fitz_meta_csv[fitz_meta_csv['md5hash'].isin(image_hashes)]
print(fitz_csv_filt_tmp.shape)
fitz_csv_filt_tmp.head()

(3905, 9)


Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum
12672,119d712798a653799adaf8e5e08ce66e,4,3,hailey hailey disease,genodermatoses,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=2409,httpwwwatlasdermatologicocombrimgimageId2409.jpg
12673,23575178ed245ee186aa5f64c94b48d9,3,3,papilomatosis confluentes and reticulate,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=5060,httpwwwatlasdermatologicocombrimgimageId5060.jpg
12674,8afc38b0f9946c000acd5f36e0c85642,2,5,scabies,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=4898,httpwwwatlasdermatologicocombrimgimageId4898.jpg
12675,89e55bbc7452d465218e6c6723c07ce0,2,3,tuberous sclerosis,genodermatoses,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=7598,httpwwwatlasdermatologicocombrimgimageId7598.jpg
12676,c78bde4a058f64a6d2fe762ae2800176,3,4,keloid,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=2969,httpwwwatlasdermatologicocombrimgimageId2969.jpg


In [11]:
# Reshape the filtered Fitzpatrick rows into the column layout of the metadata
# that Tanush already put in the bucket.
# NOTE(review): `qc` is copied into `confidence` unchanged; confirm it is on the
# same scale as the confidence values of the existing metadata.
fitz_final_metadata = (
    fitz_csv_filt_tmp[["md5hash", "label"]]
    .rename(columns={"md5hash": "image_id", "label": "unharmonized_label"})
    .assign(
        dataset="fitzpatrick17k",
        # save_name is "<dataset>_<image_id>"
        save_name=lambda frame: frame["dataset"] + '_' + frame["image_id"],
        confidence=fitz_csv_filt_tmp["qc"],
    )
)

print(fitz_final_metadata.shape)
fitz_final_metadata.head()

(3905, 5)


Unnamed: 0,image_id,unharmonized_label,dataset,save_name,confidence
12672,119d712798a653799adaf8e5e08ce66e,hailey hailey disease,fitzpatrick17k,fitzpatrick17k_119d712798a653799adaf8e5e08ce66e,
12673,23575178ed245ee186aa5f64c94b48d9,papilomatosis confluentes and reticulate,fitzpatrick17k,fitzpatrick17k_23575178ed245ee186aa5f64c94b48d9,
12674,8afc38b0f9946c000acd5f36e0c85642,scabies,fitzpatrick17k,fitzpatrick17k_8afc38b0f9946c000acd5f36e0c85642,
12675,89e55bbc7452d465218e6c6723c07ce0,tuberous sclerosis,fitzpatrick17k,fitzpatrick17k_89e55bbc7452d465218e6c6723c07ce0,
12676,c78bde4a058f64a6d2fe762ae2800176,keloid,fitzpatrick17k,fitzpatrick17k_c78bde4a058f64a6d2fe762ae2800176,


In [12]:
# Stack the existing metadata and the new Fitzpatrick rows into one table.
print(f"Expected total rows after combine: {meta_csv.shape[0] + fitz_final_metadata.shape[0]}")

# De-duplicate on (image_id, dataset) so that re-running the notebook after the
# metadata was already uploaded does not add the Fitzpatrick rows a second time.
# NOTE: this also collapses any repeated (image_id, dataset) keys that already
# exist in meta_csv, keeping only the first occurrence of each.
full_meta = (
    pd.concat([meta_csv, fitz_final_metadata], axis=0, ignore_index=True)
    .drop_duplicates(subset=["image_id", "dataset"], ignore_index=True)
)
print(f"Shape of full metadata: {full_meta.shape}")
full_meta.head()

Expected total rows after combine: 10422
Shape of full metadata: (10410, 6)


Unnamed: 0.1,Unnamed: 0,image_id,unharmonized_label,dataset,confidence,save_name
0,0.0,-3205742176803893704,"Inflicted skin lesions, Eczema",SCIN,4,SCIN_-3205742176803893704
1,1.0,-4762289084741430925,"Prurigo nodularis, SCC/SCCIS",SCIN,4,SCIN_-4762289084741430925
2,2.0,-4027806997035329030,Impetigo,SCIN,2,SCIN_-4027806997035329030
3,3.0,-3799298995660217860,"Lichen planus/lichenoid eruption, Folliculitis...",SCIN,1,SCIN_-3799298995660217860
4,4.0,-5881426422999442186,"Lichen planus/lichenoid eruption, Folliculitis...",SCIN,1,SCIN_-5881426422999442186


In [13]:
# save to bucket (disabled on purpose; uncomment to write the merged metadata back)
# NOTE(review): the upload has to go through `meta_csv_upload_blob`. The bare name
# `blob` is a loop variable left over from an earlier cell and refers to an image
# object, so uploading through it would overwrite that image with the CSV.
# full_meta = full_meta.drop('Unnamed: 0', axis=1)
# full_meta.to_csv("metadata.csv", index=False)

# meta_csv_upload_blob = bucket.blob(os.path.join(FINAL_PATH, 'metadata_tmp.csv'))
# meta_csv_upload_blob.upload_from_filename('metadata.csv')

# full_meta.head()
