In [None]:
!pip install matplotlib google-cloud-storage



In [None]:
import collections
import copy
import hashlib
import io
import os
import subprocess
import textwrap
import time
import glob

from typing import List, Text

from PIL import Image

import cv2
import numpy as np
import pandas as pd
import tabulate
import seaborn as sns
from tqdm import tqdm


import tensorflow as tf

import matplotlib.pyplot as plt

In [None]:
from google.colab import auth

# Authenticate the Colab user. A popup will ask you to sign in with your user and approve access.
# NOTE(review): presumably this supplies the credentials used by the storage.Client
# created in the "Load dataset" section — confirm.
auth.authenticate_user()

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The image-quality report CSVs read in the
# "Image Cleaning" section live in a folder under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Global params

class Globals:
  """Namespace for notebook-wide configuration and shared state.

  Attributes are read and assigned directly on the class; the visible cells
  never instantiate it. The first group holds configuration values, the second
  group starts as None and is filled in when the "Load dataset" cell runs.
  """
  # GCP project with GCS bucket of interest
  gcp_project = 'dx-scin-public' #@param

  # GCS bucket with data to read
  gcs_bucket_name = 'dx-scin-public-data' #@param

  # CSV of case metadata to read
  cases_csv = 'dataset/scin_cases.csv' #@param

  # CSV of label metadata to read
  labels_csv = 'dataset/scin_labels.csv' #@param

  # Images directory
  gcs_images_dir = 'dataset/images/' #@param

  ### Key column names
  # Columns holding the (up to three) image paths of a case
  image_path_columns = ['image_1_path', 'image_2_path', 'image_3_path']
  # Column-name constants for the condition labels
  weighted_skin_condition_label = "weighted_skin_condition_label"
  skin_condition_label = "dermatologist_skin_condition_on_label_name"

  ###### Formed during execution:

  # Client for querying GCS
  gcs_storage_client = None

  # Bucket object for loading files
  gcs_bucket = None

  # pd.DataFrame loaded from cases_csv
  cases_df = None

  # pd.DataFrame of cases_df merged with the rows of labels_csv (joined on case_id)
  cases_and_labels_df = None

# Echo the active configuration so the run is self-describing
print(f'GCS bucket name: {Globals.gcs_bucket_name}')
print(f'cases_csv: {Globals.cases_csv}')
print(f'labels_csv: {Globals.labels_csv}')
print(f'images dir: {Globals.gcs_images_dir}')

GCS bucket name: dx-scin-public-data
cases_csv: dataset/scin_cases.csv
labels_csv: dataset/scin_labels.csv
images dir: dataset/images/


## Load dataset

In [None]:
#@title Create a dataframe that contains the metadata and condition labels

from google.cloud import storage

def list_blobs(storage_client, bucket_name):
  """Print every blob of the named bucket, one per line (debugging aid)."""
  for entry in storage_client.list_blobs(bucket_name):
    print(entry)

def initialize_df_with_metadata(bucket, csv_path):
  """Download a CSV from the bucket and load it into a pd.DataFrame.

  Args:
    bucket: Bucket object whose blob(path) exposes download_as_bytes().
    csv_path: Path of the CSV blob inside the bucket.

  Returns:
    pd.DataFrame of the CSV contents, with `case_id` read as text.
  """
  # download_as_bytes() replaces the deprecated download_as_string() alias and
  # matches what read_image_from_gcs() uses.
  csv_bytes = bucket.blob(csv_path).download_as_bytes()
  # Read case_id as text so ids keep their exact textual form; lookups elsewhere
  # in the notebook compare against str(case_id).
  df = pd.read_csv(io.BytesIO(csv_bytes), dtype={'case_id': str})
  df['case_id'] = df['case_id'].astype(str)
  return df

def augment_metadata_with_labels(df, bucket, csv_path):
  """Merge the labels CSV stored in the bucket into the case metadata.

  Args:
    df: pd.DataFrame of case metadata containing a string `case_id` column.
    bucket: Bucket object whose blob(path) exposes download_as_bytes().
    csv_path: Path of the labels CSV blob inside the bucket.

  Returns:
    pd.DataFrame joining `df` with the labels on `case_id`. pd.merge defaults to
    an inner join, so cases without a matching label row are not included.
  """
  # download_as_bytes() replaces the deprecated download_as_string() alias
  labels_bytes = bucket.blob(csv_path).download_as_bytes()
  labels_df = pd.read_csv(io.BytesIO(labels_bytes), dtype={'case_id': str})
  # Same string coercion as the metadata loader so the join keys compare equal
  labels_df['case_id'] = labels_df['case_id'].astype(str)
  merged_df = pd.merge(df, labels_df, on='case_id')
  return merged_df

# Connect to the bucket, then load the case metadata and merge the labels into it.
Globals.gcs_storage_client = storage.Client(Globals.gcp_project)
Globals.gcs_bucket = Globals.gcs_storage_client.bucket(Globals.gcs_bucket_name)

Globals.cases_df = initialize_df_with_metadata(
    Globals.gcs_bucket,
    Globals.cases_csv,
)
Globals.cases_and_labels_df = augment_metadata_with_labels(
    Globals.cases_df,
    Globals.gcs_bucket,
    Globals.labels_csv,
)
# Number of rows after the merge
print(len(Globals.cases_and_labels_df))

5033


In [None]:
Globals.cases_and_labels_df.columns

Index(['case_id', 'source', 'release', 'year', 'age_group', 'sex_at_birth',
       'fitzpatrick_skin_type',
       'race_ethnicity_american_indian_or_alaska_native',
       'race_ethnicity_asian', 'race_ethnicity_black_or_african_american',
       'race_ethnicity_hispanic_latino_or_spanish_origin',
       'race_ethnicity_middle_eastern_or_north_african',
       'race_ethnicity_native_hawaiian_or_pacific_islander',
       'race_ethnicity_white', 'race_ethnicity_other_race',
       'race_ethnicity_prefer_not_to_answer', 'textures_raised_or_bumpy',
       'textures_flat', 'textures_rough_or_flaky', 'textures_fluid_filled',
       'body_parts_head_or_neck', 'body_parts_arm', 'body_parts_palm',
       'body_parts_back_of_hand', 'body_parts_torso_front',
       'body_parts_torso_back', 'body_parts_genitalia_or_groin',
       'body_parts_buttocks', 'body_parts_leg', 'body_parts_foot_top_or_side',
       'body_parts_foot_sole', 'body_parts_other',
       'condition_symptoms_bothersome_appearan

In [None]:
Globals.cases_and_labels_df.sample(1)

Unnamed: 0,case_id,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,race_ethnicity_american_indian_or_alaska_native,race_ethnicity_asian,race_ethnicity_black_or_african_american,...,dermatologist_gradable_for_fitzpatrick_skin_type_1,dermatologist_gradable_for_fitzpatrick_skin_type_2,dermatologist_gradable_for_fitzpatrick_skin_type_3,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,gradable_for_monk_skin_tone_india,gradable_for_monk_skin_tone_us,monk_skin_tone_label_india,monk_skin_tone_label_us
4372,7183833784168716616,SCIN,1.0.0,2023,AGE_50_TO_59,FEMALE,,,,,...,YES,,,FST3,,,True,True,2.0,2.0


## Important Functions

**Variables**
* df_original: The full, unmodified dataset containing all cases, metadata, and image paths. This is equivalent to Globals.cases_and_labels_df and can be used like a normal pandas DataFrame.
* df_filtered: A working copy of the dataset that you can safely modify, filter, or clean without affecting the original.
* image_dir: Path of the images directory (`/content/gcs_mount/dataset/images`); use it to access the images stored in Google Cloud.


**Functions**
* read_image_from_gcs(gcs_path)
  - Downloads and decodes an image directly from your GCS bucket using the path stored in the dataset (e.g. "dataset/images/12345.png").
* get_all_image_paths()
  - Extracts all unique image paths from the three image columns (image_1_path, image_2_path, image_3_path) in the dataset.
* show_case_images(case_id)
  - Displays all available images for a given case_id directly from GCS.
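* convert_to_binary_var(col_name)
  - Converts the given column of df_filtered to binary values: 'YES' becomes 1 and missing values become 0. Prints the resulting value counts.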

  **TODO:** Add instructions for analyzing, changing, or decoding all images together.

In [None]:
df_original = Globals.cases_and_labels_df

In [None]:
df_filtered = df_original.copy()

In [None]:
# Filesystem path to the dataset images.
# NOTE(review): presumably a mount point for the GCS bucket; no visible cell creates
# /content/gcs_mount, so confirm it exists before relying on this path.
image_dir = "/content/gcs_mount/dataset/images"

In [None]:
def read_image_from_gcs(gcs_path):
    """Download an image blob from the GCS bucket and decode it to an RGB array.

    `gcs_path` is the blob path relative to the bucket. Returns the decoded
    image, or None when decoding fails or any exception occurs (the exception
    is printed, not raised).
    """
    try:
        raw = Globals.gcs_bucket.blob(gcs_path).download_as_bytes()
        decoded = cv2.imdecode(np.frombuffer(raw, np.uint8), cv2.IMREAD_COLOR)
        # OpenCV decodes to BGR; convert to RGB for matplotlib
        return None if decoded is None else cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB)
    except Exception as err:
        print(f"Failed to read {gcs_path}: {err}")
        return None


In [None]:
def get_all_image_paths():
    """Flatten the image path columns into a list of unique image paths.

    Reads the columns named in Globals.image_path_columns from
    Globals.cases_and_labels_df and skips missing entries.

    Returns:
        list: Unique image paths, in order of first appearance when the frame
        is read row by row.
    """
    all_paths = (
        # Single source of truth for the path columns instead of a second hard-coded list
        Globals.cases_and_labels_df[Globals.image_path_columns]
        .stack()
        .dropna()
        .unique()
        .tolist()
    )
    # The camera emoji was previously stored as mis-decoded bytes
    print(f"📸 Found {len(all_paths)} unique image paths.")
    return all_paths


In [None]:
def show_case_images(case_id):
    """Display all available images for a given case_id.

    Args:
        case_id: Case identifier; compared as a string against the `case_id`
            column of Globals.cases_and_labels_df.

    Returns:
        None. Prints a message and returns without plotting when the case does
        not exist or has no image paths. An image that cannot be loaded is shown
        as an empty panel titled "<file name> (unavailable)" instead of raising.
    """
    cases = Globals.cases_and_labels_df
    row = cases[cases["case_id"] == str(case_id)]
    if row.empty:
        print(f"No case found for ID {case_id}")
        return

    # Drop the path columns that are empty for this case, then flatten to 1-D
    paths = row[Globals.image_path_columns].dropna(axis=1).values.flatten()
    if len(paths) == 0:
        print(f"No images found for case ID {case_id}")
        return

    # squeeze=False keeps `axes` two-dimensional even when there is one image
    fig, axes = plt.subplots(1, len(paths), figsize=(15, 5), squeeze=False)
    for ax, path in zip(axes[0], paths):
        img = read_image_from_gcs(path)
        title = os.path.basename(path)
        if img is None:
            # read_image_from_gcs returns None on failure; imshow(None) would raise
            title += " (unavailable)"
        else:
            ax.imshow(img)
        ax.set_title(title, fontsize=9)
        ax.axis("off")
    fig.suptitle(f"Case ID: {case_id}", fontsize=12)
    fig.tight_layout()
    plt.show()


In [None]:
def convert_to_binary_var(col_name):
  """Recode a 'YES'/missing column of df_filtered as 1/0 and print the counts.

  'YES' becomes 1 and missing values become 0; any other value is left as is.

  Args:
    col_name: Name of the df_filtered column to convert.
  """
  # Assign the result back to the frame. Calling replace/fillna with inplace=True
  # on df_filtered[col_name] is a chained operation that pandas warns will stop
  # updating the frame in 3.0.
  df_filtered[col_name] = df_filtered[col_name].replace('YES', 1).fillna(0)
  print(df_filtered[col_name].value_counts())
  print("")


## Feature Cleaning, Preprocessing, and Exploration

In [None]:
# Label missing Fitzpatrick skin types explicitly. The result is assigned back to the
# frame because fillna(inplace=True) on a selected column is a chained operation that
# pandas warns will stop updating the frame in 3.0.
df_filtered['fitzpatrick_skin_type'] = df_filtered['fitzpatrick_skin_type'].fillna('NONE_IDENTIFIED')
print(df_filtered['fitzpatrick_skin_type'].count())
df_filtered['fitzpatrick_skin_type'].value_counts()

5033


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered['fitzpatrick_skin_type'].fillna('NONE_IDENTIFIED', inplace = True)


Unnamed: 0_level_0,count
fitzpatrick_skin_type,Unnamed: 1_level_1
NONE_IDENTIFIED,2858
FST3,661
FST2,542
FST4,427
FST5,214
FST1,188
FST6,143


In [None]:
# Race/ethnicity indicator columns to recode from 'YES'/missing to 1/0
race_ethnicity_cols_list = [
    'race_ethnicity_american_indian_or_alaska_native',
    'race_ethnicity_asian',
    'race_ethnicity_black_or_african_american',
    'race_ethnicity_hispanic_latino_or_spanish_origin',
    'race_ethnicity_middle_eastern_or_north_african',
    'race_ethnicity_native_hawaiian_or_pacific_islander',
    'race_ethnicity_white',
    'race_ethnicity_other_race',
    'race_ethnicity_prefer_not_to_answer',
    'race_ethnicity_two_or_more_after_mitigation',
]

for col in race_ethnicity_cols_list:
  convert_to_binary_var(col)

race_ethnicity_american_indian_or_alaska_native
0.0    4960
1.0      73
Name: count, dtype: int64

race_ethnicity_asian
0.0    4937
1.0      96
Name: count, dtype: int64

race_ethnicity_black_or_african_american
0.0    4735
1.0     298
Name: count, dtype: int64

race_ethnicity_hispanic_latino_or_spanish_origin
0.0    4743
1.0     290
Name: count, dtype: int64

race_ethnicity_middle_eastern_or_north_african
0.0    5026
1.0       7
Name: count, dtype: int64

race_ethnicity_native_hawaiian_or_pacific_islander
0.0    5029
1.0       4
Name: count, dtype: int64

race_ethnicity_white
0.0    3160
1.0    1873
Name: count, dtype: int64

race_ethnicity_other_race
0.0    5017
1.0      16
Name: count, dtype: int64

race_ethnicity_prefer_not_to_answer
0.0    4999
1.0      34
Name: count, dtype: int64

race_ethnicity_two_or_more_after_mitigation
0.0    4950
1.0      83
Name: count, dtype: int64



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered[col_name].replace('YES', 1, inplace = True)
  df_filtered[col_name].replace('YES', 1, inplace = True)


In [None]:
# Texture indicator columns to recode from 'YES'/missing to 1/0
texture_cols_list = [
    'textures_raised_or_bumpy',
    'textures_flat',
    'textures_rough_or_flaky',
    'textures_fluid_filled',
]

for col in texture_cols_list:
  convert_to_binary_var(col)

textures_raised_or_bumpy
1.0    2915
0.0    2118
Name: count, dtype: int64

textures_flat
0.0    4211
1.0     822
Name: count, dtype: int64

textures_rough_or_flaky
0.0    4001
1.0    1032
Name: count, dtype: int64

textures_fluid_filled
0.0    4404
1.0     629
Name: count, dtype: int64



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered[col_name].replace('YES', 1, inplace = True)
  df_filtered[col_name].replace('YES', 1, inplace = True)


In [None]:
# Positions of the first and last body_parts_* columns, looked up on df_original.
# The slice below applies them to df_filtered, so it relies on both frames having
# the same column order.
body_parts_col_start = df_original.columns.get_loc('body_parts_head_or_neck')
body_parts_col_end = df_original.columns.get_loc('body_parts_other')
# NOTE: despite the name, this is a DataFrame slice (end column included via + 1),
# not a list; iterating over it yields the column names.
body_parts_cols_list = df_filtered.iloc[:, body_parts_col_start:body_parts_col_end + 1 ]

# Recode every column in the range to 1/0
for col in body_parts_cols_list:
  convert_to_binary_var(col)

body_parts_head_or_neck
0.0    4296
1.0     737
Name: count, dtype: int64

body_parts_arm
0.0    3326
1.0    1707
Name: count, dtype: int64

body_parts_palm
0.0    4818
1.0     215
Name: count, dtype: int64

body_parts_back_of_hand
0.0    4512
1.0     521
Name: count, dtype: int64

body_parts_torso_front
0.0    4238
1.0     795
Name: count, dtype: int64

body_parts_torso_back
0.0    4381
1.0     652
Name: count, dtype: int64

body_parts_genitalia_or_groin
0.0    4811
1.0     222
Name: count, dtype: int64

body_parts_buttocks
0.0    4642
1.0     391
Name: count, dtype: int64

body_parts_leg
0.0    3339
1.0    1694
Name: count, dtype: int64

body_parts_foot_top_or_side
0.0    4618
1.0     415
Name: count, dtype: int64

body_parts_foot_sole
0.0    4929
1.0     104
Name: count, dtype: int64

body_parts_other
0.0    4490
1.0     543
Name: count, dtype: int64



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered[col_name].replace('YES', 1, inplace = True)
  df_filtered[col_name].replace('YES', 1, inplace = True)


In [None]:
# Positions of the first and last symptom columns, looked up on df_original.
# The slice below applies them to df_filtered, so it relies on both frames having
# the same column order.
symptoms_col_start = df_original.columns.get_loc('condition_symptoms_bothersome_appearance')
symptoms_col_end = df_original.columns.get_loc('other_symptoms_no_relevant_symptoms')
# NOTE: despite the name, this is a DataFrame slice (end column included via + 1),
# not a list; iterating over it yields the column names.
symptoms_cols_list = df_filtered.iloc[:, symptoms_col_start:symptoms_col_end + 1 ]

# Recode every column in the range to 1/0
for col in symptoms_cols_list:
  convert_to_binary_var(col)

condition_symptoms_bothersome_appearance
0.0    3503
1.0    1530
Name: count, dtype: int64

condition_symptoms_bleeding
0.0    4808
1.0     225
Name: count, dtype: int64

condition_symptoms_increasing_size
0.0    4020
1.0    1013
Name: count, dtype: int64

condition_symptoms_darkening
0.0    4648
1.0     385
Name: count, dtype: int64

condition_symptoms_itching
1.0    2712
0.0    2321
Name: count, dtype: int64

condition_symptoms_burning
0.0    4045
1.0     988
Name: count, dtype: int64

condition_symptoms_pain
0.0    4299
1.0     734
Name: count, dtype: int64

condition_symptoms_no_relevant_experience
0.0    4725
1.0     308
Name: count, dtype: int64

other_symptoms_fever
0.0    4943
1.0      90
Name: count, dtype: int64

other_symptoms_chills
0.0    4913
1.0     120
Name: count, dtype: int64

other_symptoms_fatigue
0.0    4618
1.0     415
Name: count, dtype: int64

other_symptoms_joint_pain
0.0    4733
1.0     300
Name: count, dtype: int64

other_symptoms_mouth_sores
0.0    4937
1.0 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered[col_name].replace('YES', 1, inplace = True)
  df_filtered[col_name].replace('YES', 1, inplace = True)


In [None]:
# Replace missing values with explicit placeholder labels. Each column is assigned
# back to the frame because fillna(inplace=True) on a selected column is a chained
# operation that pandas warns will stop updating the frame in 3.0.
missing_value_labels = {
    'related_category': "NO_RESPONSE",
    'condition_duration': "NO_RESPONSE",
    'image_2_shot_type': "N/A",
    'image_3_shot_type': "N/A",
    'combined_race': "NO_RESPONSE",
}
for column_name, placeholder in missing_value_labels.items():
  df_filtered[column_name] = df_filtered[column_name].fillna(placeholder)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered['related_category'].fillna("NO_RESPONSE", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered['condition_duration'].fillna("NO_RESPONSE", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

 ## Image Cleaning

In [None]:
from google.cloud import storage
# Collect the blob names under the images directory that carry an image extension
image_extensions = (".jpg", ".jpeg", ".png")
all_images = []
for blob in Globals.gcs_bucket.list_blobs(prefix=Globals.gcs_images_dir):
    if not blob.name.endswith(image_extensions):
        continue
    all_images.append(blob.name)

In [None]:
# Images flagged as problematic by CleanVision
to_drop = [
    "dataset/images/-4593817128438983108.png",
    "dataset/images/-2431769699504014881.png",
]

# The three columns in which an image path can appear
image_cols = ['image_1_path', 'image_2_path', 'image_3_path']

# True for every row that references at least one flagged image
mask = df_filtered[image_cols].isin(to_drop).any(axis="columns")

# Keep the remaining rows as an independent frame
df_filtered = df_filtered.loc[~mask].copy()

In [None]:
# Drive folder holding the dataset artifacts. The path is under /content/drive, so
# Google Drive must already be mounted; the folder is created if it does not exist.
save_dir = "/content/drive/My Drive/BTT_Skinterest_2A/Dataset"
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Load the image quality report from the Drive folder (save_path is only read here).
# NOTE(review): no visible cell writes image_quality_report.csv, so this cell fails on a
# fresh environment unless the file already exists — confirm where it is produced.
save_path = os.path.join(save_dir, "image_quality_report.csv")
quality_df = pd.read_csv(save_path)
print("Loaded", len(quality_df), "rows")
quality_df.head()

Loaded 10379 rows


Unnamed: 0,image_path,blur,brightness_mean,brightness_std,underexp,overexp,contrast,shadow
0,dataset/images/-1001492676369731180.png,49.8009,125.822289,62.991703,0.152014,0.049442,0.988142,0.162389
1,dataset/images/-1001733364362669777.png,238.390468,74.623044,29.900723,0.03618,0.000554,1.0,0.03618
2,dataset/images/-1003800477193786941.png,89.631693,128.77948,47.623139,0.005774,0.023419,0.980237,0.055794
3,dataset/images/-1005922060850163675.png,4.17201,102.138025,62.690025,0.179942,0.0,0.99061,0.010904
4,dataset/images/-1007969568196430462.png,9.460231,147.59777,40.054561,0.0,0.002477,0.819608,0.107458


In [None]:
# Problematic images identified by CleanVision
to_drop_2 = [
    "dataset/images/4207723573736028617.png",
]

# True for every row that references a flagged image (image_cols is defined in the
# first drop cell above)
mask_2 = df_filtered[image_cols].isin(to_drop_2).any(axis=1)
# .copy(), as in the first drop, keeps df_filtered an independent frame so that later
# column assignments do not trigger pandas' SettingWithCopyWarning.
df_filtered = df_filtered.loc[~mask_2].copy()

print(f"Removed {mask_2.sum()} rows containing problematic images.")
print(f" Filtered dataset now has {len(df_filtered)} rows.")

Removed 1 rows containing problematic images.
 Filtered dataset now has 5030 rows.


In [None]:
# Load the quality report variant that includes a sharpness column. This rebinds
# save_path and quality_df, replacing the report loaded earlier.
# NOTE(review): no visible cell writes image_quality_report_with_sharpness.csv — confirm
# where it is produced.
save_path = os.path.join(save_dir, "image_quality_report_with_sharpness.csv")
quality_df = pd.read_csv(save_path)
print("Loaded", len(quality_df), "rows")
quality_df.head()

Loaded 10379 rows


Unnamed: 0,image_path,blur,brightness_mean,brightness_std,underexp,overexp,contrast,shadow,sharpness
0,dataset/images/-1001492676369731180.png,49.8009,125.822289,62.991703,0.152014,0.049442,0.988142,0.162389,49.8009
1,dataset/images/-1001733364362669777.png,238.390468,74.623044,29.900723,0.03618,0.000554,1.0,0.03618,238.390468
2,dataset/images/-1003800477193786941.png,89.631693,128.77948,47.623139,0.005774,0.023419,0.980237,0.055794,89.631693
3,dataset/images/-1005922060850163675.png,4.17201,102.138025,62.690025,0.179942,0.0,0.99061,0.010904,4.17201
4,dataset/images/-1007969568196430462.png,9.460231,147.59777,40.054561,0.0,0.002477,0.819608,0.107458,9.460231
