In [1]:
import torch
import random
import numpy as np
import os
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Standard library
import json
import os
import random
import time

# Third-party libraries
import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import torch

# Scikit-learn - core modules
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Scikit-learn - metrics
from sklearn.metrics import (
    accuracy_score, average_precision_score, balanced_accuracy_score,
    ConfusionMatrixDisplay, f1_score, log_loss,
    matthews_corrcoef, mean_squared_error, precision_score,
    PrecisionRecallDisplay, r2_score, recall_score, roc_auc_score, RocCurveDisplay
)

# Local application/library imports
from utils import load_search_space, get_model


## DATASET

In [2]:
# One seed shared by every random number generator used in this notebook
SEED = 64

# Seed PyTorch, Python's `random` module and NumPy's global RNG, in that order
for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    seed_rng(SEED)

In [3]:
# Dataset selection.
# Other dataset names noted by the author:
#   adult_income_cleaned, framingham_cleaned, preprocessed_heloc, diabetes
task_type = "Binary"
dataset_name = "adult"
dataset_subpath = "Binary/adult"

In [4]:
# Load the raw dataset from ./data/<dataset_subpath>/<dataset_name>.csv
df = pd.read_csv(f"./data/{dataset_subpath}/{dataset_name}.csv")

In [5]:
# Sanity check: (rows, columns) of the raw frame
df.shape

(48842, 15)

In [6]:
# Preview the first rows; the last column is the one preprocess_data uses as target
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802.0,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40,United-States,<=50K
1,38,Private,89814.0,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50,United-States,<=50K
2,28,Local-gov,336951.0,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40,United-States,>50K
3,44,Private,160323.0,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40,United-States,>50K
4,18,,103497.0,Some-college,10,Never-married,,Own-child,White,Female,0.0,0.0,30,United-States,<=50K


## LOAD AND PREPROCESS

In [8]:
def preprocess_data(df, dataset_name, task_type, seed=42):
    """Split a dataframe into train/val/test sets and scale/encode its features.

    Column lists and encoding choices are read from
    ``./configs/preprocess/{dataset_name}.json`` (keys ``categorical_cols``,
    ``numerical_cols`` and ``encoding``). The target is the last column of
    ``df``. Data is split 70/15/15 before any transformer is fitted, and the
    transformers are fitted on the training split only.

    Args:
        df: Raw dataframe containing every configured feature column, with the
            target as its last column.
        dataset_name: Stem of the preprocessing config file to load.
        task_type: Case-insensitive task name. ``"regression"`` gives plain
            random splits; any other value gives splits stratified on the
            target. Class weights are computed only for ``"binary"`` and
            ``"multiclass"``.
        seed: ``random_state`` used for both splits.

    Returns:
        A 9-tuple ``(X_train, X_val, X_test, y_train, y_val, y_test, None,
        label_encoder, class_weight)``:

        * ``X_*``: DataFrames indexed like the corresponding raw rows, with the
          numerical columns first and the categorical (or one-hot) columns after.
        * ``y_*``: Series aligned with the index of the matching ``X_*``.
        * seventh element: always ``None`` (kept for interface compatibility).
        * ``label_encoder``: fitted ``LabelEncoder`` when the config sets
          ``encoding["target"] == "label"``, otherwise ``None``.
        * ``class_weight``: dict of class -> "balanced" weight computed on the
          training targets, or ``None`` for non-classification tasks.

    Raises:
        KeyError: if a required config key is missing.
        OSError: if the config file cannot be opened.
    """
    task_type = task_type.lower()

    # Load config
    with open(f"./configs/preprocess/{dataset_name}.json") as f:
        config = json.load(f)

    categorical_cols = config["categorical_cols"]
    numerical_cols = config["numerical_cols"]
    encoding = config["encoding"]

    # Extract features and target
    X = df[numerical_cols + categorical_cols].copy()
    y = df.iloc[:, -1].copy()

    # Encode target if needed
    le = None
    label_mapping = None
    if encoding.get("target") == "label":
        le = LabelEncoder()
        y = le.fit_transform(y)
        label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))

    # Split raw data before transformation so nothing is fitted on val/test.
    # Classification splits are stratified; stratify=None is train_test_split's
    # default and gives the plain random split used for regression.
    stratified = task_type != "regression"
    X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
        X, y, test_size=0.3, random_state=seed,
        stratify=y if stratified else None,
    )
    X_val_raw, X_test_raw, y_val, y_test = train_test_split(
        X_temp_raw, y_temp, test_size=0.5, random_state=seed,
        stratify=y_temp if stratified else None,
    )

    # Ensure y_* are Series with index matching the X_*
    y_train = pd.Series(y_train, index=X_train_raw.index)
    y_val = pd.Series(y_val, index=X_val_raw.index)
    y_test = pd.Series(y_test, index=X_test_raw.index)

    # Compute class weights for classification
    class_weight = None
    if task_type in ["binary", "multiclass"]:
        classes = np.unique(y_train)
        class_weight_values = compute_class_weight("balanced", classes=classes, y=y_train)
        class_weight = dict(zip(classes, class_weight_values))
        print(f"Class weights: {class_weight}")

    # Decide which column groups get a fitted transformer
    scalers = {"minmax": MinMaxScaler, "standard": StandardScaler}
    numerical_encoding = encoding["numerical_features"]
    scale_numerical = numerical_encoding in scalers
    onehot_categorical = bool(categorical_cols) and encoding["categorical_features"] == "onehot"

    transformers = []
    if scale_numerical:
        transformers.append(("num", scalers[numerical_encoding](), numerical_cols))
    if onehot_categorical:
        transformers.append(
            ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols)
        )

    if transformers:
        # ColumnTransformer drops every column not claimed by a transformer
        # (remainder="drop"). When only one group is configured, the other must
        # be passed through explicitly, otherwise the output has fewer columns
        # than all_feature_names and the DataFrame construction below fails.
        has_passthrough = False
        if not scale_numerical and numerical_cols:
            transformers.insert(0, ("num", "passthrough", numerical_cols))
            has_passthrough = True
        if not onehot_categorical and categorical_cols:
            transformers.append(("cat", "passthrough", categorical_cols))
            has_passthrough = True

        preprocessor = ColumnTransformer(transformers=transformers)
        train_values = preprocessor.fit_transform(X_train_raw)
        val_values = preprocessor.transform(X_val_raw)
        test_values = preprocessor.transform(X_test_raw)

        # Recover transformed column names ("num" group first, then "cat")
        if onehot_categorical:
            cat_feature_names = list(
                preprocessor.named_transformers_["cat"].get_feature_names_out(categorical_cols)
            )
        else:
            cat_feature_names = list(categorical_cols)
        all_feature_names = numerical_cols + cat_feature_names

        def to_frame(values, raw):
            frame = pd.DataFrame(values, columns=all_feature_names, index=raw.index)
            # Passing raw string columns through next to numeric ones yields an
            # object array; restore per-column dtypes in that case only.
            return frame.infer_objects() if has_passthrough else frame

        X_train = to_frame(train_values, X_train_raw)
        X_val = to_frame(val_values, X_val_raw)
        X_test = to_frame(test_values, X_test_raw)
    else:
        # Nothing to fit: keep the raw columns in config order
        all_feature_names = numerical_cols + categorical_cols
        X_train = pd.DataFrame(X_train_raw, columns=all_feature_names, index=X_train_raw.index)
        X_val = pd.DataFrame(X_val_raw, columns=all_feature_names, index=X_val_raw.index)
        X_test = pd.DataFrame(X_test_raw, columns=all_feature_names, index=X_test_raw.index)

    print(f"Shapes — Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
    print(f"Numerical features: {len(numerical_cols)} — {numerical_cols}")
    print(f"Categorical features: {len(categorical_cols)} — {categorical_cols}")
    print(f"Total features: {X_train.shape[1]}")
    if label_mapping:
        print(f"Target label mapping: {label_mapping}")

    # NOTE(review): the seventh slot has always been None; callers that unpack
    # it as `categorical_cols` receive None, not the configured column list.
    return (
        X_train, X_val, X_test,
        y_train, y_val, y_test,
        None, le, class_weight
    )


## Generate Images

In [9]:
# Run the split + preprocessing with the notebook-wide seed.
# NOTE: preprocess_data hard-codes its seventh return value to None, so
# `categorical_cols` is None after this call rather than a list of columns.
X_train, X_val, X_test, y_train, y_val, y_test, categorical_cols, label_encoder, class_weight = preprocess_data(df, dataset_name=dataset_name, task_type=task_type, seed=SEED)

Class weights: {0: 0.6572785296831745, 1: 2.089536731450923}
Shapes — Train: (34189, 108), Val: (7326, 108), Test: (7327, 108)
Numerical features: 6 — ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
Categorical features: 8 — ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
Total features: 108
Target label mapping: {'<=50K': 0, '>50K': 1}


In [10]:
# Append each split's target to its feature frame, so the label ends up as the
# last column of train_df / val_df / test_df.
train_df, val_df, test_df = (
    pd.concat([features, target], axis=1)
    for features, target in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

print(f"Shapes — Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

Shapes — Train: (34189, 109), Val: (7326, 109), Test: (7327, 109)


In [15]:
from generate_images import generate_images_from_config, load_config

In [None]:
# Image-generation settings for this dataset; their schema is whatever
# generate_images.load_config returns (not visible in this notebook).
config_path = f"configs/image_generation/{dataset_name}.json"
config = load_config(config_path)

# NOTE(review): despite the X_* keyword names, the frames passed here are the
# features + label frames built by the concat cell above — confirm that
# generate_images_from_config expects the label as the last column.
generate_images_from_config(
    config=config,
    X_train=train_df,
    X_val=val_df,
    X_test=test_df,
)

Generating images with config: TINTO_blur

Generating images with config: TINTO_noblur

Generating images with config: IGTD

[IGTD] Auto-calculated image size: [11, 11]
Generating images with config: REFINED





In [14]:
from pathlib import Path
import os
from PIL import Image

# Base folder containing train/val/test
BASE = Path(f"SyntheticImages/{task_type}/{dataset_name}/BIE")
SPLITS = ["train", "val", "test"]
# Sub-folder of each split that the loop in this cell scans for images
SUB = "images"

# Padding background color
RGB_PAD_COLOR = (0, 0, 0)  # black (RGB 0, 0, 0)

def pad_to_square_right_or_bottom(im: Image.Image) -> Image.Image:
    """Pad an image to a square canvas, keeping the original at the top-left.

    The canvas side is ``max(width, height)``. Because the original is pasted
    at ``(0, 0)``, a wider image gains padding rows at the bottom and a taller
    image gains padding columns on the right. Padding uses ``RGB_PAD_COLOR``.

    Args:
        im: Source image.

    Returns:
        ``im`` itself (mode untouched) when it is already square; otherwise a
        new RGB image of size ``(side, side)``. Non-RGB inputs are converted to
        RGB before pasting.
    """
    w, h = im.size
    if w == h:
        return im  # already square

    side = max(w, h)

    # The canvas is RGB, so bring the source into the same mode first
    if im.mode != "RGB":
        im = im.convert("RGB")

    canvas = Image.new("RGB", (side, side), RGB_PAD_COLOR)
    # One paste covers both orientations: anchoring at the top-left leaves the
    # extra space below a wide image and to the right of a tall one.
    canvas.paste(im, (0, 0))
    return canvas

def is_image(fname: str) -> bool:
    """Return True when ``fname`` ends in .png, .jpg or .jpeg (case-insensitive)."""
    lowered = fname.lower()
    return any(lowered.endswith(ext) for ext in (".png", ".jpg", ".jpeg"))

# Run the padding over <BASE>/<split>/<SUB>, overwriting non-square images in place
count_total, count_padded, count_skipped = 0, 0, 0
for split in SPLITS:
    img_dir = BASE / split / SUB
    if not img_dir.exists():
        print(f"[warn] missing dir: {img_dir}")
        continue
    for fname in os.listdir(img_dir):
        if not is_image(fname):
            continue
        path = img_dir / fname
        try:
            with Image.open(path) as im:
                count_total += 1
                if im.size[0] == im.size[1]:
                    count_skipped += 1
                    continue
                # Fixed: this used to call pad_to_square_one_side, which is not
                # defined anywhere; the NameError was swallowed by the except
                # below, so every non-square image was reported as an error.
                out = pad_to_square_right_or_bottom(im)
                out.save(path, format=im.format)  # overwrite, keeping the source format
                count_padded += 1
        except Exception as e:
            # Best-effort batch job: report the file and keep going
            print(f"[error] {path}: {e}")

print(f"\nDone. Total: {count_total}, padded: {count_padded}, already square: {count_skipped}")


[warn] missing dir: SyntheticImages/Binary/adult/BIE/train/images
[warn] missing dir: SyntheticImages/Binary/adult/BIE/val/images
[warn] missing dir: SyntheticImages/Binary/adult/BIE/test/images

Done. Total: 0, padded: 0, already square: 0


In [15]:
from pathlib import Path
from PIL import Image

# Base folder containing train/val/test
BASE = Path(f"SyntheticImages/{task_type}/{dataset_name}/BIE")
# Split folders scanned by the padding loop; images are read from the
# sub-folders found directly inside each of them
SPLITS = ["train", "val", "test"]

# Padding background color
RGB_PAD_COLOR = (0, 0, 0)  # black


def pad_to_square_right_or_bottom(im: Image.Image) -> Image.Image:
    """Pad an image to a square canvas, keeping the original at the top-left.

    The canvas side is ``max(width, height)``. Because the original is pasted
    at ``(0, 0)``, a wider image is padded at the bottom and a taller image is
    padded on the right. Padding is filled with ``RGB_PAD_COLOR``.

    Args:
        im: Source image.

    Returns:
        ``im`` itself (mode untouched) when it is already square; otherwise a
        new RGB image of size ``(side, side)``. Non-RGB inputs are converted to
        RGB before pasting.
    """
    width, height = im.size
    if width == height:
        return im  # already square

    side = max(width, height)

    # The canvas is RGB, so the source must be RGB as well
    source = im if im.mode == "RGB" else im.convert("RGB")

    squared = Image.new("RGB", (side, side), RGB_PAD_COLOR)
    # Top-left anchoring handles both orientations, so no wide/tall branch is needed
    squared.paste(source, (0, 0))
    return squared


def is_image(fname: str) -> bool:
    """Tell whether ``fname`` has a .png, .jpg or .jpeg ending, ignoring case."""
    image_suffixes = (".png", ".jpg", ".jpeg")
    normalized = fname.lower()
    return normalized.endswith(image_suffixes)


# Walk <BASE>/<split>/<sub-folder>/* and pad every non-square image in place
count_total = count_padded = count_skipped = 0
for split in SPLITS:
    split_dir = BASE / split
    if not split_dir.exists():
        print(f"[warn] missing dir: {split_dir}")
        continue

    # Only the folders directly inside train/val/test are scanned
    for class_dir in split_dir.iterdir():
        if not class_dir.is_dir():
            print(f"[skip non-folder] {class_dir}")
            continue

        for path in class_dir.iterdir():
            if not (path.is_file() and is_image(path.name)):
                continue
            try:
                with Image.open(path) as im:
                    count_total += 1
                    width, height = im.size
                    if width == height:
                        count_skipped += 1
                    else:
                        out = pad_to_square_right_or_bottom(im)
                        out.save(path, format=im.format)  # overwrite
                        count_padded += 1
            except Exception as e:
                # Report the failing file and carry on with the rest
                print(f"[error] {path}: {e}")

print(f"\nDone. Total: {count_total}, padded: {count_padded}, already square: {count_skipped}")


[skip non-folder] SyntheticImages/Binary/adult/BIE/train/supervised.csv
[skip non-folder] SyntheticImages/Binary/adult/BIE/val/supervised.csv
[skip non-folder] SyntheticImages/Binary/adult/BIE/test/supervised.csv

Done. Total: 48842, padded: 48842, already square: 0
