# Check Fitz Overlap with SkinCAP

This notebook checks the overlap between the Fitzpatrick17k images we have downloaded and the Fitzpatrick17k-sourced images that are included in SkinCAP.

In [20]:
from google.cloud import storage
import pandas as pd
from io import StringIO, BytesIO

# Disable truncation so displayed DataFrames show every row and every column.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# Shared Google Cloud Storage handles used by the helpers below.
client = storage.Client()
bucket = client.bucket('derma-datasets-2')

def load_csv(path):
    """Download a CSV object from the module-level `bucket` and parse it.

    Args:
        path: Object name (key) of the CSV file inside `bucket`.

    Returns:
        A pandas DataFrame parsed from the object's text content.
    """
    csv_text = bucket.blob(path).download_as_text()
    return pd.read_csv(StringIO(csv_text))

# skincap_csv_blob = bucket.blob('raw/SkinCAP/skincap_v240623.csv')
# skincap_csv_data = skincap_csv_blob.download_as_text()
# skincap_meta = pd.read_csv(StringIO(skincap_csv_data))

# Load the SkinCAP metadata CSV from the bucket into a DataFrame.
skincap_meta = load_csv('raw/SkinCAP/skincap_v240623.csv')

In [22]:
# Keep only the identifier, the two file-name columns and the source label.
skincap_meta = skincap_meta.loc[:, ["id", "skincap_file_path", "ori_file_path", "source"]]
skincap_meta.head()

Unnamed: 0,id,skincap_file_path,ori_file_path,source
0,1,1.png,000002.png,ddi
1,2,2.png,000003.png,ddi
2,3,3.png,000004.png,ddi
3,4,4.png,000005.png,ddi
4,5,5.png,000006.png,ddi


In [3]:
# Sanity check: is any row missing its `source` label? (We filter on this column next.)
skincap_meta['source'].isna().any()

np.False_

In [4]:
# Boolean mask of the SkinCAP rows whose image is attributed to Fitzpatrick17k,
# and the corresponding subset of the metadata.
skincap_meta_fitz_flg = skincap_meta["source"].eq("fitzpatrick17k")
skincap_meta_fitz = skincap_meta.loc[skincap_meta_fitz_flg]

In [5]:
# Preview the Fitzpatrick17k-sourced subset of the SkinCAP metadata.
skincap_meta_fitz.head()

Unnamed: 0,id,skincap_file_path,ori_file_path,source
655,656,656.png,fa2911a9b13b6f8af79cb700937cc14f.jpg,fitzpatrick17k
656,657,657.png,e702b1a7dc40aa1d8e85ccdb019c4ab2.jpg,fitzpatrick17k
657,658,658.png,8438db40abd1eccfbc7ee4b469f1b6f1.jpg,fitzpatrick17k
658,659,659.png,9a3af1bc39e115bcc6931170cf8a00bb.jpg,fitzpatrick17k
659,660,660.png,59ccb668671950ca657b6bc48213b763.jpg,fitzpatrick17k


In [6]:
# filter SkinCAP data to only consider Fitzpatrick images not in final Fitzpatrick dataset (i.e. the image file is not in the raw Fitzpatrick bucket)

def list_files(prefix, get_full_path=False):
    """List the objects in the module-level `bucket` whose names start with `prefix`.

    A trailing "/" is appended to `prefix` when missing, so the prefix is
    treated as a folder. The listing is requested without a delimiter, so
    objects in nested "sub-folders" are expected to be returned too (see the
    `list_blobs` documentation).

    Args:
        prefix: Folder-like object-name prefix, with or without a trailing "/".
        get_full_path: When True, return the full object names. When False
            (default), return names relative to `prefix`.

    Returns:
        A list of object names (full, or relative to `prefix`). An object
        named exactly `prefix` yields an empty string in relative mode.
    """
    if not prefix.endswith("/"):
        prefix = prefix + "/"

    filepaths = [blob.name for blob in bucket.list_blobs(prefix=prefix)]
    if get_full_path:
        return filepaths

    # Strip only the *leading* prefix. `str.replace` would also delete later
    # occurrences of the prefix inside a nested path and corrupt the name.
    return [
        path[len(prefix):] if path.startswith(prefix) else path
        for path in filepaths
    ]

# Names (relative to the folder) of every object under the raw Fitzpatrick17k image folder.
fitz_img_filenames = list_files("raw/fitzpatrick17k/images")
fitz_img_filenames[:5]

['000e8dd5ee75dd6668e978e7a4e6fe54.jpg',
 '001b1edd9c7a02e9381793f6e9081d20.jpg',
 '002f4ec0b620339d58ccfa26c4268584.jpg',
 '005c26fd71696854f7b4c9d80a5dc9d6.jpg',
 '0066264501f73b1db09b3e2aa4dac0a0.jpg']

In [7]:
# Keep the Fitzpatrick17k-sourced SkinCAP rows whose original file name is NOT
# among the files in our raw Fitzpatrick17k image folder (i.e. images we did not download).
not_in_fitz_flg = ~skincap_meta_fitz["ori_file_path"].isin(fitz_img_filenames)
skincap_dont_have = skincap_meta_fitz.loc[not_in_fitz_flg]
skincap_dont_have.shape

(2811, 4)

In [8]:
# Preview the Fitzpatrick17k-sourced SkinCAP rows whose original file we do not have.
skincap_dont_have.head()

Unnamed: 0,id,skincap_file_path,ori_file_path,source
655,656,656.png,fa2911a9b13b6f8af79cb700937cc14f.jpg,fitzpatrick17k
656,657,657.png,e702b1a7dc40aa1d8e85ccdb019c4ab2.jpg,fitzpatrick17k
657,658,658.png,8438db40abd1eccfbc7ee4b469f1b6f1.jpg,fitzpatrick17k
658,659,659.png,9a3af1bc39e115bcc6931170cf8a00bb.jpg,fitzpatrick17k
659,660,660.png,59ccb668671950ca657b6bc48213b763.jpg,fitzpatrick17k


In [9]:
# Which of those images do we actually hold as SkinCAP files?

# PNG names under the SkinCAP image folder, relative to that folder.
# NOTE(review): list_files lists without a delimiter, so this presumably also returns
# objects under `not_include/` as "not_include/<name>" -- confirm whether the first
# printed count is meant to cover the main folder only.
main_listing = list_files("raw/SkinCAP/skincap/")
sc_img_filenames = [name for name in main_listing if name.endswith("png")]
print(len(sc_img_filenames))

# Add the PNG names found in the `not_include` sub-folder, then drop duplicates.
not_include_listing = list_files("raw/SkinCAP/skincap/not_include")
sc_img_filenames.extend(name for name in not_include_listing if name.endswith("png"))
sc_img_filenames = list(set(sc_img_filenames))
print(len(sc_img_filenames))

4346
4692


In [10]:
# Sanity check: every row of skincap_dont_have should have its SkinCAP file in the bucket.
# Any row displayed here is an image we hold in neither dataset.
not_in_either_flg = ~skincap_dont_have["skincap_file_path"].isin(sc_img_filenames)
skincap_dont_have.loc[not_in_either_flg]

Unnamed: 0,id,skincap_file_path,ori_file_path,source


Of the images we have in SkinCAP, 2811 come from Fitzpatrick17k but are missing from our raw Fitzpatrick17k download. This only includes the files in the main `skincap` folder. Let's see if any of the files in the `not_include` folder are also part of Fitzpatrick17k. We can do that by computing the MD5 hash of each file and checking whether it appears in the full Fitzpatrick17k metadata.

In [13]:
import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed

def compute_md5_for_blob(blob):
    """Stream a blob's content and compute its MD5 digest.

    Args:
        blob: Object exposing `.name` and `.open("rb")` (a binary file-like
            context manager), e.g. a Cloud Storage blob.

    Returns:
        Tuple `(blob.name, hex_digest)` where `hex_digest` is the MD5 of the
        blob's bytes as a hexadecimal string.
    """
    digest = hashlib.md5()
    with blob.open("rb") as stream:
        # Feed the hash in 8 KiB chunks so the whole file never sits in memory.
        while chunk := stream.read(8192):
            digest.update(chunk)
    return blob.name, digest.hexdigest()

def compute_md5_gcs_folder_parallel(bucket_name, prefix, max_workers=16):
    """Compute MD5 digests for the image objects under a prefix, using a thread pool.

    Objects are listed through the module-level `client`; only names ending
    (case-insensitively) in a known image extension are hashed, each via
    `compute_md5_for_blob` on a worker thread.

    Args:
        bucket_name: Name of the bucket to list.
        prefix: Object-name prefix selecting the objects to hash.
        max_workers: Maximum number of worker threads.

    Returns:
        Dict mapping full object name -> MD5 hex digest. Entries are inserted
        in completion order, so the key order can differ between runs.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff')

    image_blobs = []
    for candidate in client.list_blobs(bucket_name, prefix=prefix):
        if candidate.name.lower().endswith(image_suffixes):
            image_blobs.append(candidate)

    hashes = {}
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(compute_md5_for_blob, item) for item in image_blobs]
        # Collect results as each download/hash finishes; result() re-raises worker errors.
        for finished in as_completed(pending):
            blob_name, digest = finished.result()
            hashes[blob_name] = digest

    return hashes

# Hash every image in the SkinCAP `not_include` folder.
bucket_name, prefix = "derma-datasets-2", "raw/SkinCAP/skincap/not_include/"
md5s = compute_md5_gcs_folder_parallel(bucket_name, prefix)
# for name, md5sum in md5s.items():
#     print(f"{name}: {md5sum}")

# One row per hashed object: full object name and its MD5 hex digest.
hash_rows = [(object_name, digest) for object_name, digest in md5s.items()]
not_include_hashes = pd.DataFrame(hash_rows, columns=["file_name", "md5_hash"])

In [14]:
# Preview the (object name, MD5 digest) table for the `not_include` folder.
not_include_hashes.head()

Unnamed: 0,file_name,md5_hash
0,raw/SkinCAP/skincap/not_include/4009.png,5ca94c02d89651613468462514c5d23b
1,raw/SkinCAP/skincap/not_include/4007.png,06967f5227a4227896b2b7c5e6901652
2,raw/SkinCAP/skincap/not_include/4002.png,60abb67a831bace6b777a6f4d82b7032
3,raw/SkinCAP/skincap/not_include/4003.png,c94e4af9862e5883872c85f4912433bd
4,raw/SkinCAP/skincap/not_include/4006.png,40c298a68522a302b8a22ebf3395194e


In [17]:
# Load the Fitzpatrick17k metadata CSV; its `md5hash` column is what we match hashes against.
fitz_meta = load_csv('raw/fitzpatrick17k/fitzpatrick17k.csv')
fitz_meta.head()

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum
0,5e82a45bc5d78bd24ae9202d194423f8,3,3,drug induced pigmentary changes,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicmminoc...
1,fa2911a9b13b6f8af79cb700937cc14f,1,1,photodermatoses,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicpphoto...
2,d2bac3c9e4499032ca8e9b07c7d3bc40,2,3,dermatofibroma,benign dermal,benign,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicdderma...
3,0a94359e7eaacd7178e06b2823777789,1,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...
4,a39ec3b1f22c08a421fa20535e037bba,1,1,psoriasis,inflammatory,non-neoplastic,,https://www.dermaamin.com/site/images/clinical...,httpwwwdermaamincomsiteimagesclinicalpicppsori...


In [19]:
# Left-join the Fitzpatrick17k metadata to the `not_include` hashes on the MD5 value,
# then show only the metadata rows that matched a hashed file.
fitz_meta_we_have = pd.merge(
    fitz_meta,
    not_include_hashes,
    how="left",
    left_on="md5hash",
    right_on="md5_hash",
)
fitz_meta_we_have.dropna(subset=["file_name"])

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,url,url_alphanum,file_name,md5_hash
15053,50bb16c1bfa029e80a55ce64149f3338,5,4,mycosis fungoides,malignant cutaneous lymphoma,malignant,,http://atlasdermatologico.com.br/img?imageId=8701,httpwwwatlasdermatologicocombrimgimageId8701.jpg,raw/SkinCAP/skincap/not_include/4001.png,50bb16c1bfa029e80a55ce64149f3338
15055,60abb67a831bace6b777a6f4d82b7032,3,3,rhinophyma,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=6482,httpwwwatlasdermatologicocombrimgimageId6482.jpg,raw/SkinCAP/skincap/not_include/4002.png,60abb67a831bace6b777a6f4d82b7032
15056,c94e4af9862e5883872c85f4912433bd,3,1,pityriasis rosea,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=5599,httpwwwatlasdermatologicocombrimgimageId5599.jpg,raw/SkinCAP/skincap/not_include/4003.png,c94e4af9862e5883872c85f4912433bd
15058,331c09538efd0f63138152a5de9c1bbc,4,4,fixed eruptions,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=2080,httpwwwatlasdermatologicocombrimgimageId2080.jpg,raw/SkinCAP/skincap/not_include/4004.png,331c09538efd0f63138152a5de9c1bbc
15059,20851c52ed096af52a69bb5a2670c505,2,2,squamous cell carcinoma,malignant epidermal,malignant,,http://atlasdermatologico.com.br/img?imageId=6838,httpwwwatlasdermatologicocombrimgimageId6838.jpg,raw/SkinCAP/skincap/not_include/4005.png,20851c52ed096af52a69bb5a2670c505
15061,40c298a68522a302b8a22ebf3395194e,4,4,psoriasis,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=6265,httpwwwatlasdermatologicocombrimgimageId6265.jpg,raw/SkinCAP/skincap/not_include/4006.png,40c298a68522a302b8a22ebf3395194e
15067,06967f5227a4227896b2b7c5e6901652,3,3,photodermatoses,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=5430,httpwwwatlasdermatologicocombrimgimageId5430.jpg,raw/SkinCAP/skincap/not_include/4007.png,06967f5227a4227896b2b7c5e6901652
15076,c3994c3b7fc8f7f537ce70972618c80d,2,-1,squamous cell carcinoma,malignant epidermal,malignant,,http://atlasdermatologico.com.br/img?imageId=6772,httpwwwatlasdermatologicocombrimgimageId6772.jpg,raw/SkinCAP/skincap/not_include/4008.png,c3994c3b7fc8f7f537ce70972618c80d
15077,5ca94c02d89651613468462514c5d23b,4,4,porokeratosis actinic,benign epidermal,benign,,http://atlasdermatologico.com.br/img?imageId=5903,httpwwwatlasdermatologicocombrimgimageId5903.jpg,raw/SkinCAP/skincap/not_include/4009.png,5ca94c02d89651613468462514c5d23b
15083,bb79039b52e9e693c47056248372931b,2,2,folliculitis,inflammatory,non-neoplastic,,http://atlasdermatologico.com.br/img?imageId=1691,httpwwwatlasdermatologicocombrimgimageId1691.jpg,raw/SkinCAP/skincap/not_include/4010.png,bb79039b52e9e693c47056248372931b
