In [None]:
# ruff: noqa: E402
import math
import warnings
from typing import Dict, Literal

warnings.simplefilter("ignore")
import delu  # Deep Learning Utilities: https://github.com/Yura52/delu
import numpy as np
import scipy.special
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing
import torch
import torch.nn.functional as F
import torch.optim
from torch import Tensor
from tqdm.std import tqdm
import json
import sys

warnings.resetwarnings()

from rtdl_revisiting_models import MLP, ResNet, FTTransformer

sys.path.append('..')

from interpretDistill.fourierDistill import *
from interpretDistill.binaryTransformer import *

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Set random seeds in all libraries.
delu.random.seed(0)

In [1]:
t = {'j':0, 'l':2}

In [2]:
t[0]

KeyError: 0

## Dataset

In [None]:
# >>> Dataset.
TaskType = Literal["regression", "binclass", "multiclass"]

task_type: TaskType = "regression"
n_classes = None
dataset = sklearn.datasets.fetch_california_housing(as_frame = True)
X: np.ndarray = dataset["data"]
Y: np.ndarray = dataset["target"]

all_idx = np.arange(len(Y))
trainval_idx, test_idx = sklearn.model_selection.train_test_split(
    all_idx, train_size=0.8, random_state = 0
)
train_idx, val_idx = sklearn.model_selection.train_test_split(
    trainval_idx, train_size=0.8, random_state = 0
)

X_b = {}
bt = BinaryTransformer(depth = 3, bit = True)
X_b['train'] = bt.fit_and_transform(X.loc[train_idx, :], Y.loc[train_idx])
X_b['val'] = bt.transform(X.loc[val_idx, :])
X_b['test'] = bt.transform(X.loc[test_idx, :])


# >>> Continuous features.
#X_cont: np.ndarray = X[['carat', 'depth', 'table', 'x', 'y', 'z']].to_numpy().astype(np.float32)
n_cont_features = 0

# >>> Categorical features.
# NOTE: the above datasets do not have categorical features, but,
# for the demonstration purposes, it is possible to generate them.

cat_cardinalities = [len(X_b['train'][c].value_counts()) for c in X_b['train'].columns]
print(cat_cardinalities)


for k in X_b.keys():
    X_b[k] = (X_b[k] + 1) // 2

# >>> Labels.
# Regression labels must be represented by float32.

Y = Y.to_numpy()
if task_type == "regression":
    Y = Y.astype(np.float32)
else:
    assert n_classes is not None
    Y = Y.astype(np.int64)
    assert set(Y.tolist()) == set(
        range(n_classes)
    ), "Classification labels must form the range [0, 1, ..., n_classes - 1]"

data_numpy = {
    "train": {"x_cat": X_b['train'].to_numpy().astype(np.int64), "y": Y[train_idx]},
    "val": {"x_cat": X_b['val'].to_numpy().astype(np.int64), "y": Y[val_idx]},
    "test": {"x_cat": X_b['test'].to_numpy().astype(np.int64), "y": Y[test_idx]},
}

In [None]:
data_numpy['val']['x_cat'].shape

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def separate_features(df):
    categorical_features = []
    continuous_features = []
    
    for column in df.columns:
        if df[column].dtype == 'object':
            categorical_features.append(column)
        elif pd.api.types.is_numeric_dtype(df[column]):
            unique_values = df[column].nunique()
            if unique_values <= 2:
                categorical_features.append(column)
            else:
                continuous_features.append(column)
        else:
            raise ValueError(f"Unsupported column type: {df[column].dtype}")
    
    df_categorical = df[categorical_features]
    df_continuous = df[continuous_features]
    
    # Optional: Encode categorical features
    label_encoders = {}
    for column in categorical_features:
        le = LabelEncoder()
        df_categorical[column] = le.fit_transform(df_categorical[column])
        label_encoders[column] = le
    
    return df_continuous, df_categorical, label_encoders

# Usage example
data = {
    'age': [25, 32, 47, 51],
    'salary': [50000, 60000, 70000, 80000],
    'gender': ['male', 'female', 'female', 'male'],
    'is_employed': [1, 0, 1, 1]
}

df = pd.DataFrame(data)
df_continuous, df_categorical, label_encoders = separate_features(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical[column] = le.fit_transform(df_categorical[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical[column] = le.fit_transform(df_categorical[column])


In [34]:
from sklearn.datasets import fetch_california_housing

In [35]:
iris = fetch_california_housing(as_frame = True)
iris_df = iris.data
iris_df['target'] = iris.target

In [40]:
iris_df['target'].to_numpy()

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [37]:
df_numerical_features = iris_df.select_dtypes(include=['float64', 'float32'])
df_categorical_features = iris_df.drop(columns = df_numerical_features.columns)

In [42]:
~False

-1

In [16]:
df_categorical_features

Unnamed: 0,gender
0,male
1,female
2,female
3,male


## Preprocessing

In [None]:
if task_type == "regression":
    Y_mean = data_numpy["train"]["y"].mean().item()
    Y_std = data_numpy["train"]["y"].std().item()
    for part in data_numpy:
        data_numpy[part]["y"] = (data_numpy[part]["y"] - Y_mean) / Y_std

# >>> Convert data to tensors.
data = {
    part: {'y': torch.as_tensor(data_numpy[part]['y'], device=device)}
    for part in data_numpy
}

if True:
    data["train"]["x_cat"] = torch.from_numpy(data_numpy["train"]["x_cat"]).to(torch.int64).to(device)
    data["val"]["x_cat"] = torch.from_numpy(data_numpy["val"]["x_cat"]).to(torch.int64).to(device)
    data["test"]["x_cat"] = torch.from_numpy(data_numpy["test"]["x_cat"]).to(torch.int64).to(device)

if task_type != "multiclass":
    # Required by F.binary_cross_entropy_with_logits
    for part in data:
        data[part]["y"] = data[part]["y"].float()

## Model

In [None]:
# The output size.
d_out = n_classes if task_type == "multiclass" else 1

# # NOTE: uncomment to train MLP
# model = MLP(
#     d_in=n_cont_features + sum(cat_cardinalities),
#     d_out=d_out,
#     n_blocks=2,
#     d_block=384,
#     dropout=0.1,
# ).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-5)

# # NOTE: uncomment to train ResNet
# model = ResNet(
#     d_in=n_cont_features + sum(cat_cardinalities),
#     d_out=d_out,
#     n_blocks=2,
#     d_block=192,
#     d_hidden=None,
#     d_hidden_multiplier=2.0,
#     dropout1=0.3,
#     dropout2=0.0,
# ).to(device)
# optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-5)

model = FTTransformer(
    n_cont_features=n_cont_features,
    cat_cardinalities=cat_cardinalities,
    d_out=d_out,
    **FTTransformer.get_default_kwargs(),
).to(device)
optimizer = model.make_default_optimizer()

## Training

In [None]:
def apply_model(batch: Dict[str, Tensor]) -> Tensor:
    if isinstance(model, (MLP, ResNet)):
        x_cat_ohe = (
            [
                F.one_hot(column, cardinality)
                for column, cardinality in zip(batch["x_cat"].T, cat_cardinalities)
            ]
            if "x_cat" in batch
            else []
        )
        return model(torch.column_stack([batch["x_cont"]] + x_cat_ohe)).squeeze(-1)

    elif isinstance(model, FTTransformer):
        return model(None, batch.get("x_cat")).squeeze(-1)

    else:
        raise RuntimeError(f"Unknown model type: {type(model)}")


loss_fn = (
    F.binary_cross_entropy_with_logits
    if task_type == "binclass"
    else F.cross_entropy
    if task_type == "multiclass"
    else F.mse_loss
)


@torch.no_grad()
def evaluate(part: str, ret = False) -> float:
    model.eval()

    eval_batch_size = 8096
    y_pred = (
        torch.cat(
            [
                apply_model(batch)
                for batch in delu.iter_batches(data[part], eval_batch_size)
            ]
        )
        .cpu()
        .numpy()
    )
    y_true = data[part]["y"].cpu().numpy()

    if task_type == "binclass":
        y_pred = np.round(scipy.special.expit(y_pred))
        score = sklearn.metrics.accuracy_score(y_true, y_pred)
    elif task_type == "multiclass":
        y_pred = y_pred.argmax(1)
        score = sklearn.metrics.accuracy_score(y_true, y_pred)
    else:
        assert task_type == "regression"
        score = -(sklearn.metrics.mean_squared_error(y_true, y_pred) ** 0.5 * Y_std)
    if ret:
        return score, y_pred
    return score  # The higher -- the better.


print(f'Val score before training: {evaluate("val"):.4f}')

In [None]:
# For demonstration purposes (fast training and bad performance),
# one can set smaller values:
# n_epochs = 20
# patience = 2
n_epochs = 1000
patience = 16

batch_size = 256
epoch_size = math.ceil(len(train_idx) / batch_size)
timer = delu.tools.Timer()
early_stopping = delu.tools.EarlyStopping(patience, mode="max")
best = {
    "val": -math.inf,
    "test": -math.inf,
    "epoch": -1,
}

print(f"Device: {device.type.upper()}")
print("-" * 88 + "\n")
timer.run()
for epoch in range(n_epochs):
    for batch in tqdm(
        delu.iter_batches(data["train"], batch_size, shuffle=True),
        desc=f"Epoch {epoch}",
        total=epoch_size,
    ):
        model.train()
        optimizer.zero_grad()
        loss = loss_fn(apply_model(batch), batch["y"])
        loss.backward()
        optimizer.step()

    val_score = evaluate("val")
    #print(f"(val) {val_score:.4f} [time] {timer}")

    early_stopping.update(val_score)
    if early_stopping.should_stop():
        break

    if val_score > best["val"]:
        #print("🌸 New best epoch! 🌸")
        best = {"val": val_score, "epoch": epoch}
        torch.save(model.state_dict(), f'models/ftt_bit_depth3')
    print()

print("\n\nResult:")
print(best)

In [None]:
evaluate('val'), evaluate('test')

In [None]:
pkg = {}
for i in ['train', 'val', 'test']:
    pkg[i] = {'y_hat':evaluate(i, ret = True)[1].tolist(), 'y_true':data_numpy[i]['y'].tolist()}
    
pkg['Y_stats'] = {'mean':Y_mean, 'std':Y_std}
with open(f'predictions/ftt_bit_depth3_preds.json', 'w') as fp:
    json.dump(pkg, fp)