In [3]:
import sys
import pandas as pd
from pathlib import Path

sys.path.append('..') 

from src.config import config
from src.utils import seed_everything, make_stratified_split

# Setup: seed the run from the project config before any data handling.
# The captured output for this cell reads "Global seed set to 42".
# NOTE(review): which RNGs seed_everything covers is defined in src.utils
# and is not visible from this notebook — confirm there.
seed_everything(config.SEED)

 Global seed set to 42


In [5]:
# Load the metadata table from the path given in the project config and
# report how many rows it has before any filtering.
df = pd.read_csv(config.METADATA_PATH)
n_records = len(df)
print(f"Original Records: {n_records}")

def find_image_path(image_id, search_dirs=None, extension=".jpg"):
    """Return the path of the image file for ``image_id``, or ``None``.

    Args:
        image_id: Identifier used as the file stem.
        search_dirs: Directories to look in, in priority order. When omitted,
            ``config.IMAGE_DIR_PART1`` is checked first, then
            ``config.IMAGE_DIR_PART2``.
        extension: Suffix appended to ``image_id`` (including the dot).

    Returns:
        The path as a ``str`` from the first directory that contains the
        file, or ``None`` when none of the directories has it.
    """
    if search_dirs is None:
        # Resolved at call time so explicit callers never touch the config.
        search_dirs = (config.IMAGE_DIR_PART1, config.IMAGE_DIR_PART2)

    filename = f"{image_id}{extension}"
    for directory in search_dirs:
        candidate = Path(directory) / filename
        if candidate.exists():
            return str(candidate)

    return None

# Resolve every image_id to a file on disk; find_image_path yields None
# for ids that have no matching file.
print("üîç Searching for images...")
resolved_paths = df['image_id'].apply(find_image_path)
df['image_path'] = resolved_paths

# Report how many rows could not be matched to an image file.
missing = resolved_paths.isna().sum()
print(f"Missing Images: {missing}")

# Keep only the rows whose image was found.
df = df.dropna(subset=['image_path'])

Original Records: 10015
üîç Searching for images...
Missing Images: 0


In [9]:
# Split data (original note: "by Lesion ID").
# NOTE(review): the grouping/stratification logic lives in
# src.utils.make_stratified_split and is not visible here — confirm there.
split_settings = {
    'test_size': config.TEST_SIZE,
    'val_size': config.VAL_SIZE,
    'seed': config.SEED,
}
df = make_stratified_split(df, **split_settings)

# Show how many rows carry each value of the 'split' column.
print("Split Result:")
split_counts = df['split'].value_counts()
print(split_counts)

Split Result:
split
train    7054
test     1497
val      1464
Name: count, dtype: int64


In [27]:
# Save CSV
# Create the output directory first: DataFrame.to_csv does not create missing
# parent directories, so on a fresh checkout the writes below would fail.
output_dir = Path(config.PROCESSED_DATA_DIR)
output_dir.mkdir(parents=True, exist_ok=True)

train_df = df[df['split'] == 'train']
val_df = df[df['split'] == 'val']
test_df = df[df['split'] == 'test']

# Every row must land in exactly one file; a row whose 'split' value is not
# train/val/test would otherwise be left out of all three CSVs unnoticed.
assert len(train_df) + len(val_df) + len(test_df) == len(df), (
    "rows with a split label other than train/val/test would not be saved"
)

# One file per split, named after the split.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df.to_csv(output_dir / f'{split_name}.csv', index=False)

print(f" Saved CSVs to {config.PROCESSED_DATA_DIR}")

# Draft table: number of rows per (split, dx) pair, pivoted so that each
# split is a row and each dx value is a column; absent pairs count as 0.
rows_per_split_and_class = df.groupby(['split', 'dx']).size()
dist_df = rows_per_split_and_class.unstack(fill_value=0)
display(dist_df)

 Saved CSVs to /Users/mati/adv_skin_cancer/data/processed


dx,akiec,bcc,bkl,df,mel,nv,vasc
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
test,51,77,157,22,168,1000,22
train,233,365,775,76,777,4730,98
val,43,72,167,17,168,975,22


In [26]:
# One row per dx class, one column per split, in train/val/test order.
split_columns = ['train', 'val', 'test']
pretty_df = dist_df.T[split_columns]
pretty_df = pretty_df.assign(Total=pretty_df.sum(axis=1))

# Share of each class that ended up in each split (rows sum to ~100%).
percent_columns = {'% Train': 'train', '% Val': 'val', '% Test': 'test'}
for pct_name, count_name in percent_columns.items():
    pretty_df[pct_name] = pretty_df[count_name] / pretty_df['Total'] * 100

# Share of the whole dataset that each class represents.
grand_total = pretty_df['Total'].sum()
pretty_df['% of All Data'] = pretty_df['Total'] / grand_total * 100

# Largest classes first.
pretty_df = pretty_df.sort_values(by='Total', ascending=False)

# Counts get thousands separators; percentages get a fixed number of decimals.
count_columns = split_columns + ['Total']
column_formats = {name: '{:,}' for name in count_columns}
column_formats.update({name: '{:.1f}%' for name in percent_columns})
column_formats['% of All Data'] = '{:.2f}%'

# Shade only the count columns; the percentage columns stay unshaded.
styled_table = pretty_df.style.background_gradient(cmap='Blues', subset=count_columns)
styled_table = styled_table.format(column_formats)
styled_table = styled_table.set_caption("Class Distribution with Percentages")

display(styled_table)


split,train,val,test,Total,% Train,% Val,% Test,% of All Data
dx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
nv,4730,975,1000,6705,70.5%,14.5%,14.9%,66.95%
mel,777,168,168,1113,69.8%,15.1%,15.1%,11.11%
bkl,775,167,157,1099,70.5%,15.2%,14.3%,10.97%
bcc,365,72,77,514,71.0%,14.0%,15.0%,5.13%
akiec,233,43,51,327,71.3%,13.1%,15.6%,3.27%
vasc,98,22,22,142,69.0%,15.5%,15.5%,1.42%
df,76,17,22,115,66.1%,14.8%,19.1%,1.15%
