In [1]:
!pip install matplotlib google-cloud-storage
!pip install transformers==4.39.3

Collecting transformers==4.39.3
  Downloading transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m134.8/134.8 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.39.3)
  Downloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.8/8.8 MB[0m [31m95.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.6/3.6 MB[0m [31m110.3 MB

In [2]:
import collections
import copy
import hashlib
import io
import os
import subprocess
import textwrap
import time
import glob

from typing import List, Text

from PIL import Image

import cv2
import numpy as np
import pandas as pd
import tabulate
import seaborn as sns
from tqdm import tqdm
import ast


import tensorflow as tf

import matplotlib.pyplot as plt

In [3]:
from google.colab import auth

# Authenticate user for access. There will be a popup asking you to sign in with your user and approve access.
auth.authenticate_user()

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
#@title Global params

class Globals:
  # GCP project with GCS bucket of interest
  gcp_project = 'dx-scin-public' #@param

  # GCS bucket with data to read
  gcs_bucket_name = 'dx-scin-public-data' #@param

  # CSV of case metadata to read
  cases_csv = 'dataset/scin_cases.csv' #@param

  # CSV of label metadata to read
  labels_csv = 'dataset/scin_labels.csv' #@param

  # Images directory
  gcs_images_dir = 'dataset/images/' #@param

  ### Key column names
  image_path_columns = ['image_1_path', 'image_2_path', 'image_3_path']
  weighted_skin_condition_label = "weighted_skin_condition_label"
  skin_condition_label = "dermatologist_skin_condition_on_label_name"

  ###### Formed during execution:

  # Client for querying GCS
  gcs_storage_client = None

  # Bucket object for loading files
  gcs_bucket = None

  # pd.DataFrame for the loaded metadata_csv
  cases_df = None

  # pd.DataFrame for the loaded labels_csv
  cases_and_labels_df = None

print(f'GCS bucket name: {Globals.gcs_bucket_name}')
print(f'cases_csv: {Globals.cases_csv}')
print(f'labels_csv: {Globals.labels_csv}')
print(f'images dir: {Globals.gcs_images_dir}')

GCS bucket name: dx-scin-public-data
cases_csv: dataset/scin_cases.csv
labels_csv: dataset/scin_labels.csv
images dir: dataset/images/


In [6]:
#@title Create a dataframe that contains the metadata and condition labels

from google.cloud import storage

def list_blobs(storage_client, bucket_name):
  """Helper to list blobs in a bucket (useful for debugging)."""
  blobs = storage_client.list_blobs(bucket_name)
  for blob in blobs:
    print(blob)

def initialize_df_with_metadata(bucket, csv_path):
  """Loads the given CSV into a pd.DataFrame."""
  df = pd.read_csv(io.BytesIO(bucket.blob(csv_path).download_as_string()), dtype={'case_id': str})
  df['case_id'] = df['case_id'].astype(str)
  return df

def augment_metadata_with_labels(df, bucket, csv_path):
  """Loads the given CSV into a pd.DataFrame."""
  labels_df = pd.read_csv(io.BytesIO(bucket.blob(csv_path).download_as_string()), dtype={'case_id': str})
  labels_df['case_id'] = labels_df['case_id'].astype(str)
  merged_df = pd.merge(df, labels_df, on='case_id')
  return merged_df

Globals.gcs_storage_client = storage.Client(Globals.gcp_project)
Globals.gcs_bucket = Globals.gcs_storage_client.bucket(
    Globals.gcs_bucket_name
)
Globals.cases_df = initialize_df_with_metadata(Globals.gcs_bucket, Globals.cases_csv)
Globals.cases_and_labels_df = augment_metadata_with_labels(Globals.cases_df, Globals.gcs_bucket, Globals.labels_csv)
print(len(Globals.cases_and_labels_df))

5033


## Important Functions

**Variables**
* df_original: The full, unmodified dataset containing all cases, metadata, and image paths. This is equivalent to Globals.cases_and_labels_df and can be used like a normal pandas DataFrame.
* df_filtered: A working copy of the dataset that you can safely modify, filter, or clean without affecting the original.
* image_dir : Use this directory to access images within google cloud


**Functions**
* read_image_from_gcs(gcs_path)
  - Downloads and decodes an image directly from your GCS bucket using the path stored in the dataset (e.g. "dataset/images/12345.png").
*  get_all_image_paths(df):
  - Extracts all unique image paths from the three image columns (image_1_path, image_2_path, image_3_path) in the dataset.
* show_case_images(case_id)
  - Displays all available images for a given case_id directly from GCS.
* convert_to_binary_var(col_name)
  - convert to binary values
* load_image(path, label)

  Note: add instructions if we want to analyze/change/decode all images together

In [7]:
df_original = Globals.cases_and_labels_df

In [8]:
df_filtered = df_original.copy()

In [9]:
image_dir = "/content/gcs_mount/dataset/images"

In [10]:
def read_image_from_gcs(gcs_path):
    """Reads and decodes an image from GCS (relative path)."""
    try:
        blob = Globals.gcs_bucket.blob(gcs_path)
        img_bytes = blob.download_as_bytes()
        img_array = np.frombuffer(img_bytes, np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None:
            return None
        # Convert BGR (OpenCV) ‚Üí RGB (matplotlib)
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img_rgb
    except Exception as e:
        print(f"Failed to read {gcs_path}: {e}")
        return None


In [11]:
def get_all_image_paths(df):
    """
    Flatten all image_1/2/3 columns from the given DataFrame
    into a unique list of image paths.
    """
    image_cols = ['image_1_path', 'image_2_path', 'image_3_path']

    all_paths = (
        df[image_cols]
        .stack()          # combine all image columns vertically
        .dropna()         # remove missing entries
        .unique()         # only keep unique paths
        .tolist()         # convert to list
    )

    print(f"üì∏ Found {len(all_paths)} unique image paths.")
    return all_paths


In [12]:
def show_case_images(case_id):
    """Display all available images for a given case_id."""
    row = Globals.cases_and_labels_df[Globals.cases_and_labels_df["case_id"] == str(case_id)]
    if row.empty:
        print(f"No case found for ID {case_id}")
        return

    paths = row[['image_1_path', 'image_2_path', 'image_3_path']].dropna(axis=1).values.flatten()
    plt.figure(figsize=(15, 5))
    for i, path in enumerate(paths, 1):
        img = read_image_from_gcs(path)
        plt.subplot(1, len(paths), i)
        plt.imshow(img)
        plt.title(f"{os.path.basename(path)}", fontsize=9)
        plt.axis("off")
    plt.suptitle(f"Case ID: {case_id}", fontsize=12)
    plt.tight_layout()
    plt.show()


In [13]:
def convert_to_binary_var(col_name):
  df_filtered[col_name].replace('YES', 1, inplace = True)
  df_filtered[col_name].fillna(0, inplace = True)
  print(df_filtered[col_name].value_counts())
  print("")


In [14]:
IMG_SIZE = (224, 224)

def load_image(path, label):
    image = tf.io.read_file(path)
    image = tf.image.decode_png(image, channels=3)
    image = tf.image.resize(image, IMG_SIZE)
    image = tf.cast(image, tf.float32) / 255.0
    label = tf.cast(label, tf.int32)   # IMPORTANT
    return image, label

 ## Image Cleaning

In [15]:
df_filtered_path = "/content/drive/My Drive/BTT_Skinterest_2A/Dataset/dermatologist_conditions_split.csv"
df_filtered = pd.read_csv(df_filtered_path)
df_filtered[:10]

  df_filtered = pd.read_csv(df_filtered_path)


Unnamed: 0,case_id,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,race_ethnicity_american_indian_or_alaska_native,race_ethnicity_asian,race_ethnicity_black_or_african_american,...,dermatologist_gradable_for_fitzpatrick_skin_type_2,dermatologist_gradable_for_fitzpatrick_skin_type_3,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,gradable_for_monk_skin_tone_india,gradable_for_monk_skin_tone_us,monk_skin_tone_label_india,monk_skin_tone_label_us,condition_weight
0,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST2,,,True,True,2.0,1.0,0.41
1,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST2,,,True,True,2.0,1.0,0.41
2,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST2,,,True,True,2.0,1.0,0.18
3,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST1,,,True,True,3.0,3.0,0.41
4,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST1,,,True,True,3.0,3.0,0.18
5,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST1,,,True,True,3.0,3.0,0.41
6,-1003358831658393077,SCIN,1.0.0,2023,AGE_18_TO_29,MALE,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST4,,,True,True,3.0,4.0,0.55
7,-1003358831658393077,SCIN,1.0.0,2023,AGE_18_TO_29,MALE,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST4,,,True,True,3.0,4.0,0.23
8,-1003358831658393077,SCIN,1.0.0,2023,AGE_18_TO_29,MALE,NONE_IDENTIFIED,0.0,0.0,0.0,...,,,FST4,,,True,True,3.0,4.0,0.23
9,-1003844406100696311,SCIN,1.0.0,2023,AGE_40_TO_49,FEMALE,FST3,0.0,0.0,0.0,...,,,FST1,,,True,True,1.0,1.0,0.33


In [16]:
from google.cloud import storage
# List all images in dataset/images/
all_images = []
for blob in Globals.gcs_bucket.list_blobs(prefix=Globals.gcs_images_dir):
    if blob.name.endswith((".jpg",".jpeg",".png")):
        all_images.append(blob.name)

In [17]:
# Problematic images identified by CleanVision
to_drop = [
    "dataset/images/-4593817128438983108.png",
    "dataset/images/-2431769699504014881.png"
]

# Columns that contain image paths
image_cols = ['image_1_path', 'image_2_path', 'image_3_path']

# Build a boolean mask for rows that have any of these images
mask = df_filtered[image_cols].isin(to_drop).any(axis=1)

# Apply filter (keep only rows without those images)
df_filtered = df_filtered[~mask].copy()

In [18]:
save_dir = "/content/drive/My Drive/BTT_Skinterest_2A/Dataset"
os.makedirs(save_dir, exist_ok=True)

In [19]:
save_path = os.path.join(save_dir, "image_quality_report.csv")
quality_df = pd.read_csv(save_path)
print("Loaded", len(quality_df), "rows")
quality_df.head()

Loaded 10379 rows


Unnamed: 0,image_path,blur,brightness_mean,brightness_std,underexp,overexp,contrast,shadow
0,dataset/images/-1001492676369731180.png,49.8009,125.822289,62.991703,0.152014,0.049442,0.988142,0.162389
1,dataset/images/-1001733364362669777.png,238.390468,74.623044,29.900723,0.03618,0.000554,1.0,0.03618
2,dataset/images/-1003800477193786941.png,89.631693,128.77948,47.623139,0.005774,0.023419,0.980237,0.055794
3,dataset/images/-1005922060850163675.png,4.17201,102.138025,62.690025,0.179942,0.0,0.99061,0.010904
4,dataset/images/-1007969568196430462.png,9.460231,147.59777,40.054561,0.0,0.002477,0.819608,0.107458


In [20]:
# Problematic images identified by CleanVision
to_drop_2 = [
    "dataset/images/4207723573736028617.png",
]

mask_2 = df_filtered[image_cols].isin(to_drop_2).any(axis=1)
df_filtered = df_filtered.loc[~mask_2]

print(f"Removed {mask_2.sum()} rows containing problematic images.")
print(f" Filtered dataset now has {len(df_filtered)} rows.")

Removed 0 rows containing problematic images.
 Filtered dataset now has 8209 rows.


In [21]:
save_path = os.path.join(save_dir, "image_quality_report_with_sharpness.csv")
quality_df = pd.read_csv(save_path)
print("Loaded", len(quality_df), "rows")
quality_df.head()

Loaded 8209 rows


  quality_df = pd.read_csv(save_path)


Unnamed: 0,case_id,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,race_ethnicity_american_indian_or_alaska_native,race_ethnicity_asian,race_ethnicity_black_or_african_american,...,dermatologist_gradable_for_fitzpatrick_skin_type_1,dermatologist_gradable_for_fitzpatrick_skin_type_2,dermatologist_gradable_for_fitzpatrick_skin_type_3,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,gradable_for_monk_skin_tone_india,gradable_for_monk_skin_tone_us,monk_skin_tone_label_india,monk_skin_tone_label_us
0,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,YES,,,FST2,,,True,True,2.0,1.0
1,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,YES,,,FST2,,,True,True,2.0,1.0
2,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,YES,,,FST2,,,True,True,2.0,1.0
3,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,YES,,,FST1,,,True,True,3.0,3.0
4,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,NONE_IDENTIFIED,0.0,0.0,0.0,...,YES,,,FST1,,,True,True,3.0,3.0


## Model Training Prep

In [22]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [23]:
# Select highest-confidence condition per case
train_df = df_filtered.copy()

train_df = train_df.loc[train_df.groupby("case_id")["condition_confidence"].idxmax()]
train_df = train_df.reset_index(drop=True)

# Assign top label
train_df["top_label"] = train_df["condition_name"]


In [24]:
image_cols = ["image_1_path", "image_2_path", "image_3_path"]

df_long = train_df.melt(
    id_vars=["case_id", "top_label"],
    value_vars=image_cols,
    var_name="image_num",
    value_name="image_path"
)

# Drop rows with missing image paths
df_long = df_long.dropna(subset=["image_path"])


In [25]:
df_long["image_path"] = "gs://" + Globals.gcs_bucket_name + "/" + df_long["image_path"].astype(str)

In [26]:
le = LabelEncoder()
df_long["label_id"] = le.fit_transform(df_long["top_label"])
num_classes = len(le.classes_)

print(f"Found {num_classes} unique labels.")


Found 210 unique labels.


In [27]:
# Remove classes that have fewer than 2 images
label_counts = df_long["label_id"].value_counts()
df_long = df_long[df_long["label_id"].isin(label_counts[label_counts >= 2].index)]

In [28]:
# First compute existence for ALL rows
df_long["file_exists"] = df_long["image_path"].apply(tf.io.gfile.exists)

# Extract invalid paths
invalid_paths = df_long[df_long["file_exists"] == False]["image_path"].tolist()

print("Number of invalid images:", len(invalid_paths))
invalid_paths[:20]   # preview first 20


Number of invalid images: 1


['gs://dx-scin-public-data/dataset/images/-2243186711511406658.png']

In [41]:
df_long = df_long[df_long["file_exists"] == True].drop(columns=["file_exists"])

KeyError: 'file_exists'

In [30]:
paths = df_long["image_path"].values
labels = df_long["label_id"].values


train_paths, test_paths, train_labels, test_labels = train_test_split(
    paths, labels,
    test_size=0.4,
    random_state=42,
    stratify=labels
)


In [31]:
train_ds = tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
train_ds = train_ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
test_ds = test_ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.batch(32).prefetch(tf.data.AUTOTUNE)


## ResNet 50 Model

In [32]:
from transformers import TFResNetModel, AutoImageProcessor
import tensorflow as tf
from tensorflow.keras import layers, models

processor = AutoImageProcessor.from_pretrained("microsoft/resnet-50")
resnet = TFResNetModel.from_pretrained("microsoft/resnet-50")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFResNetModel: ['resnet.encoder.stages.1.layers.0.layer.1.normalization.num_batches_tracked', 'resnet.encoder.stages.2.layers.2.layer.2.normalization.num_batches_tracked', 'resnet.encoder.stages.2.layers.4.layer.0.normalization.num_batches_tracked', 'resnet.encoder.stages.2.layers.3.layer.2.normalization.num_batches_tracked', 'resnet.encoder.stages.1.layers.3.layer.0.normalization.num_batches_tracked', 'resnet.encoder.stages.0.layers.0.layer.1.normalization.num_batches_tracked', 'resnet.encoder.stages.1.layers.2.layer.0.normalization.num_batches_tracked', 'resnet.encoder.stages.2.layers.0.shortcut.normalization.num_batches_tracked', 'resnet.encoder.stages.3.layers.2.layer.1.normalization.num_batches_tracked', 'resnet.encoder.stages.0.layers.0.layer.2.normalization.num_batches_tracked', 'resnet.embedder.embedder.normalization.num_batches_tracked', 'resnet.encoder.stages.2.layers.1.layer.0.normalization.nu

In [None]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50

# Base ResNet
resnet = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224,224,3),
    pooling='avg'
)
resnet.trainable = False   # optional

image_input = layers.Input(shape=(224,224,3))
x = layers.Rescaling(1./255)(image_input)

x = resnet(x, training=False)   # works! accepts Keras tensors
x = layers.Dense(256, activation="relu")(x)
output = layers.Dense(num_classes, activation="softmax")(x)

model = models.Model(image_input, output)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history = model.fit(train_ds, validation_data=test_ds, epochs=10)

model.save("resnet_no_metadata.keras")


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m5s[0m 0us/step
Epoch 1/10
[1m122/122[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m758s[0m 5s/step - accuracy: 0.1402 - loss: 4.2222 - val_accuracy: 0.1607 - val_loss: 3.8519
Epoch 2/10
[1m122/122[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m690s[0m 5s/step - accuracy: 0.1562 - loss: 3.9027 - val_accuracy: 0.1607 - val_loss: 3.8386
Epoch 3/10
[1m122/122[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m666s[0m 5s/step - accuracy: 0.1555 - loss: 3.8750 - val_accuracy: 0.1607 - val_loss: 3.8433
Epoch 4/10
[1m122/122[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m662s[0m 5s/s

In [None]:
model_save_dir = "/content/drive/My Drive/BTT_Skinterest_2A/Coding for Skinterest Tech 2A Project/Mahek_Models"
os.makedirs(model_save_dir, exist_ok=True)

model = os.path.join(model_save_dir, 'resnet.keras')
model.save(model)

print("Saved ResNet model to:", model)


AttributeError: 'str' object has no attribute 'save'

In [33]:
def load_image_only(path, label):
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, (224,224))
    return img, label

train_ds = tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
train_ds = train_ds.map(load_image_only).batch(32).prefetch(tf.data.AUTOTUNE)

test_ds = tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
test_ds = test_ds.map(load_image_only).batch(32).prefetch(tf.data.AUTOTUNE)


In [34]:
print("Unique labels:", np.unique(train_labels)[:20])
print("Max label:", np.max(train_labels))
print("Min label:", np.min(train_labels))

Unique labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
Max label: 209
Min label: 0


In [35]:
train_df.columns


Index(['case_id', 'source', 'release', 'year', 'age_group', 'sex_at_birth',
       'fitzpatrick_skin_type',
       'race_ethnicity_american_indian_or_alaska_native',
       'race_ethnicity_asian', 'race_ethnicity_black_or_african_american',
       'race_ethnicity_hispanic_latino_or_spanish_origin',
       'race_ethnicity_middle_eastern_or_north_african',
       'race_ethnicity_native_hawaiian_or_pacific_islander',
       'race_ethnicity_white', 'race_ethnicity_other_race',
       'race_ethnicity_prefer_not_to_answer', 'textures_raised_or_bumpy',
       'textures_flat', 'textures_rough_or_flaky', 'textures_fluid_filled',
       'body_parts_head_or_neck', 'body_parts_arm', 'body_parts_palm',
       'body_parts_back_of_hand', 'body_parts_torso_front',
       'body_parts_torso_back', 'body_parts_genitalia_or_groin',
       'body_parts_buttocks', 'body_parts_leg', 'body_parts_foot_top_or_side',
       'body_parts_foot_sole', 'body_parts_other',
       'condition_symptoms_bothersome_appearan

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df["label_id"] = le.fit_transform(train_df["top_label"])

body_part_cols = [
    'body_parts_head_or_neck',
    'body_parts_arm',
    'body_parts_palm',
    'body_parts_back_of_hand',
    'body_parts_torso_front',
    'body_parts_torso_back',
    'body_parts_genitalia_or_groin',
    'body_parts_buttocks',
    'body_parts_leg',
    'body_parts_foot_top_or_side',
    'body_parts_foot_sole',
    'body_parts_other'
]

df_long = train_df.melt(
    id_vars=["case_id", "top_label", "label_id"]
            + body_part_cols,
    value_vars=image_cols,
    var_name="image_num",
    value_name="image_path"
)

df_long = df_long.dropna(subset=["image_path"])


In [None]:
df_long[["image_path", "label_id"]].head()


Unnamed: 0,image_path,label_id
0,dataset/images/3422278879386892670.png,176
1,dataset/images/-3162371366187734223.png,56
2,dataset/images/-4063990915557730380.png,99
3,dataset/images/-1590080144874886066.png,190
4,dataset/images/-8500170905733656840.png,56


In [None]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
import os

# Condition Symptoms Metadata columns
condition_symptoms_cols = [
 'condition_symptoms_bothersome_appearance',
 'condition_symptoms_bleeding',
 'condition_symptoms_increasing_size',
 'condition_symptoms_darkening',
 'condition_symptoms_itching',
 'condition_symptoms_burning',
 'condition_symptoms_pain',
 'condition_symptoms_no_relevant_experience',
]

metadata_cols = condition_symptoms_cols

num_classes = df_long["label_id"].nunique()     # condition classes


In [None]:
resnet = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
    pooling='avg'
)
resnet.trainable = False


In [None]:
# ----- IMAGE INPUT -----
image_input = layers.Input(shape=(224, 224, 3), name="image")
x = layers.Rescaling(1./255)(image_input)
x = resnet(x, training=False)
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.3)(x)

# ----- METADATA INPUT -----
meta_input = layers.Input(shape=(len(metadata_cols),), name="meta")
m = layers.Dense(64, activation="relu")(meta_input)
m = layers.Dropout(0.2)(m)

# ----- FUSION -----
combined = layers.Concatenate()([x, m])
h = layers.Dense(128, activation="relu")(combined)
h = layers.Dropout(0.3)(h)

output = layers.Dense(num_classes, activation="softmax")(h)

model = models.Model(inputs=[image_input, meta_input], outputs=output)


In [None]:
optimizer = tf.keras.optimizers.Adam(1e-4, clipnorm=1.0)

model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


In [None]:
history = model.fit(
    train_ds,          # ({'image':..., 'meta':...}, label_id)
    validation_data=test_ds,
    epochs=10
)
model.save("resnet_condition_model.keras")

model_save_dir = "/content/drive/My Drive/BTT_Skinterest_2A/Coding for Skinterest Tech 2A Project/Mahek_Models"
os.makedirs(model_save_dir, exist_ok=True)

model_path = os.path.join(model_save_dir, 'resnet_condition_model.keras')
model.save(model_path)

print("Saved condition model to:", model_path)


Epoch 1/10


ValueError: Layer "functional" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'data:0' shape=(None, 224, 224, 3) dtype=float32>]

In [42]:
condition_symptoms_cols = [
 'condition_symptoms_bothersome_appearance',
 'condition_symptoms_bleeding',
 'condition_symptoms_increasing_size',
 'condition_symptoms_darkening',
 'condition_symptoms_itching',
 'condition_symptoms_burning',
 'condition_symptoms_pain',
 'condition_symptoms_no_relevant_experience',
]

# 1) MELT USING df_filtered (not df)
df_long = df_filtered.melt(
    id_vars=["case_id"] + condition_symptoms_cols,
    value_vars=image_cols,
    var_name="img_num",
    value_name="image_path"
)

# 2) keep only rows with real image paths
df_long = df_long.dropna(subset=["image_path"]).reset_index(drop=True)

# 3) prepend gs://bucket_name/
df_long["image_path"] = (
    "gs://" + Globals.gcs_bucket_name + "/" + df_long["image_path"].astype(str)
)

# 4) verify existence
df_long["file_exists"] = df_long["image_path"].apply(tf.io.gfile.exists)
df_long = df_long[df_long["file_exists"] == True].drop(columns=["file_exists"])

# 5) create condition symptoms label
df_long["condition_symptoms_label"] = df_long[condition_symptoms_cols].idxmax(axis=1)
label_map = {col: i for i, col in enumerate(condition_symptoms_cols)}
df_long["condition_symptoms_label"] = df_long["condition_symptoms_label"].map(label_map).astype("int32")

# 6) extract paths + labels
paths = df_long["image_path"].values
labels = df_long["condition_symptoms_label"].values

# 7) split
train_paths, test_paths, train_labels, test_labels = train_test_split(
    paths, labels, test_size=0.4, random_state=42, stratify=labels
)


In [43]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
import os
# 8) dataset loader
def load_image(path, label):
    img = tf.io.read_file(path)
    img = tf.image.decode_png(img, channels=3)
    img = tf.image.resize(img, (224,224))
    img = tf.cast(img, tf.float32) / 255.0
    return img, label

train_ds = (tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
            .map(load_image).shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE))

test_ds = (tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
           .map(load_image).batch(32).prefetch(tf.data.AUTOTUNE))

# 9) simple ResNet
num_classes = len(condition_symptoms_cols)
resnet = ResNet50(include_top=False, weights="imagenet",
                  input_shape=(224,224,3), pooling="avg")
resnet.trainable = False

image_input = layers.Input(shape=(224,224,3))
x = layers.Rescaling(1./255)(image_input)
x = resnet(x, training=False)
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.3)(x)
output = layers.Dense(num_classes, activation="softmax")(x)

model = models.Model(image_input, output)
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

history = model.fit(train_ds, validation_data=test_ds, epochs=10)


Epoch 1/10
[1m348/348[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1263s[0m 3s/step - accuracy: 0.4969 - loss: 1.4118 - val_accuracy: 0.5159 - val_loss: 1.3497
Epoch 2/10
[1m348/348[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1185s[0m 3s/step - accuracy: 0.5123 - loss: 1.3655 - val_accuracy: 0.5159 - val_loss: 1.3512
Epoch 3/10
[1m348/348[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1074s[0m 3s/step - accuracy: 0.5174 - loss: 1.3520 - val_accuracy: 0.5159 - val_loss: 1.3388
Epoch 4/10
[1m348/348[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m1135s[0m 3s/step - accuracy: 0.5179 - loss: 1.3471 - val_accuracy: 0.5159 - val_loss: 1.3319
Epoch 5/10


KeyboardInterrupt: 

In [44]:
#Save model
model_save_dir = '/content/drive/My Drive/BTT_Skinterest_2A/Coding for Skinterest Tech 2A Project/Nivi_Models'
os.makedirs(model_save_dir, exist_ok=True)
model_path = os.path.join(model_save_dir, 'resnet_condition_symptoms_classifier_1.keras')
model.save(model_path)
#model.save("resnet_condition_symptoms_classifier.keras")

# Monk Scale

In [45]:
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
import os

monk_cols = [
 'gradable_for_monk_skin_tone_india',
 'gradable_for_monk_skin_tone_us',
 'monk_skin_tone_label_india',
 'monk_skin_tone_label_us'
]
num_classes = len(monk_cols)

resnet = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
    pooling='avg'
)
resnet.trainable = False  # freeze base CNN (optional)

image_input = layers.Input(shape=(224, 224, 3))
x = layers.Rescaling(1./255)(image_input)

# Feature extractor
x = resnet(x, training=False)

# Classification head
x = layers.Dense(256, activation="relu")(x)
output = layers.Dense(num_classes, activation="softmax")(x)

model_monk_scale = models.Model(inputs=image_input, outputs=output)

model_monk_scale.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',  # labels should be integers 0‚Äì3
    metrics=['accuracy']
)

history = model_monk_scale.fit(
    train_ds,
    validation_data=test_ds,
    epochs=10
)

# model_monk_scale.save("resnet_textures_no_metadata.keras")

# model_save_dir = "/content/drive/My Drive/BTT_Skinterest_2A/Coding for Skinterest Tech 2A Project/Mahek_Models"
# os.makedirs(model_save_dir, exist_ok=True)

# model_path_texture = os.path.join(model_save_dir, 'resnet_textures.keras')
# model.save(model_path_texture)

# print("Saved texture ResNet model to:", model_path_texture)


Epoch 1/10
[1m 23/348[0m [32m‚îÅ[0m[37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m12:19[0m 2s/step - accuracy: 0.4376 - loss: nan

KeyboardInterrupt: 

In [None]:
#Save model
model_save_dir = '/content/drive/My Drive/BTT_Skinterest_2A/Coding for Skinterest Tech 2A Project/Nivi_Models'
os.makedirs(model_save_dir, exist_ok=True)
model_path = os.path.join(model_save_dir, 'resnet_monk_scale_classifier.keras')
model_monk_scale.save(model_path)