In [None]:
import sys
import os

# Absolute location of the `src` folder, resolved against the directory the
# kernel is currently running in.
src_path = os.path.abspath('src')

# Prepend it to the import search path only once, so re-running this cell
# does not pile up duplicate entries.
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)

In [None]:
import torch
from typing import Iterator

In [None]:
import pandas as pd
from tabpfn_extensions import TabPFNRegressor, TabPFNClassifier
from tabpfn_extensions.embedding import TabPFNEmbedding

import argparse
import os
import numpy as np
import random
from tqdm import tqdm

from src.data_utils import load_data_for_tabPFN
from src.data_constants import targets

# Set the random seed for reproducibility. NumPy and the stdlib RNG were
# already seeded; torch is seeded too because the model fitted below runs on it.
s = 42
np.random.seed(s)
random.seed(s)
torch.manual_seed(s)

# load the dataset
f = 'data/processed/vitals_yeo-johnson_test_data.csv'
if not f.endswith(".csv"):
    # Fail here with a clear message; silently skipping the load would only
    # surface later as a NameError on `data`.
    raise ValueError(f"Expected a .csv file, got: {f}")
data, ids = load_data_for_tabPFN(f)
print(f"Loaded data from {f}")

# Split the loaded frame into target columns (y) and feature columns (x).
y = data[targets]
x = data.drop(columns=targets)
vecs = []
print(x.columns)
# The last entry of `targets` is the label the classifier is fitted on below.
t = targets[-1]


In [None]:
from tabpfn.config import ModelInterfaceConfig
from tabpfn.preprocessing import EnsembleConfig

In [None]:
# Inference settings handed to the classifier below. Both shift methods are
# explicitly set to None (presumably switching off feature/class shifting
# between ensemble members; confirm against the TabPFN config docs).
config = ModelInterfaceConfig(FEATURE_SHIFT_METHOD=None, CLASS_SHIFT_METHOD=None)

In [None]:
# NOTE: despite the name `reg`, this object is a TabPFNClassifier.
# A single estimator is used, with the inference config defined above and a
# fixed set of column positions declared as categorical.
reg = TabPFNClassifier(
    n_estimators=1,
    inference_config=config,
    categorical_features_indices=[0, 2, 4, 6, 7, 8],
    random_state=s,
)
# Attach the feature column names to the (not yet fitted) estimator.
reg.feature_names_in_ = x.columns

In [None]:
from tabpfn.utils import validate_X_predict, _fix_dtypes, _process_text_na_dataframe

In [None]:
# Inspect the dtype of every feature column before fitting.
x.dtypes

In [None]:
# Fit the classifier on all feature columns against the single target column `t`.
reg.fit(x, y[t])

In [None]:
# Run TabPFN's predict-time input validation on the features using the fitted
# estimator. NOTE: `x` is overwritten, so re-running this cell feeds the
# already-validated data back in.
x = validate_X_predict(x, reg)

In [None]:
# Private TabPFN helper, called with the estimator's categorical column indices
# (presumably normalises column dtypes for the model; confirm in tabpfn.utils).
# NOTE: `x` is overwritten again here.
x = _fix_dtypes(x, cat_indices=reg.categorical_features_indices)

In [None]:
# Private TabPFN helper; the fitted `reg.preprocessor_` is passed as its ordinal
# encoder. The result `X` is what gets fed to the ensemble preprocessors below.
X = _process_text_na_dataframe(x, ord_encoder=reg.preprocessor_)

In [None]:
# Sanity check: dimensions of the encoded input.
X.shape

In [None]:
from sklearn.base import check_is_fitted, is_classifier
# check_is_fitted returns None on success and raises NotFittedError otherwise,
# so printing its result only ever shows "None". Call it for the check alone.
check_is_fitted(reg)

In [None]:
def custom_iter_outputs(
    executor,
    X: np.ndarray,
    *,
    device: torch.device,
    autocast: bool,
    only_return_standard_out: bool = True,
) -> Iterator[tuple[torch.Tensor | dict, EnsembleConfig]]:
    """Yield the preprocessed test input for each ensemble member of ``executor``.

    No model forward pass is run: for every ensemble member the member's
    preprocessor is applied to ``X`` and the transformed features are
    returned as a tensor.

    Args:
        executor: Object exposing the parallel sequences ``preprocessors``,
            ``X_trains``, ``y_trains``, ``ensemble_configs`` and ``cat_ixs``.
        X: Input passed to each preprocessor's ``transform``.
        device: Device on which the resulting tensor is created.
        autocast: Accepted for signature compatibility; not used here.
        only_return_standard_out: Accepted for signature compatibility; not
            used here.

    Yields:
        ``(X_test, config)`` pairs, where ``X_test`` is the ``.X`` attribute of
        the preprocessor's transform result as a float32 tensor on ``device``
        and ``config`` is the matching entry of ``executor.ensemble_configs``.
    """
    members = zip(
        executor.preprocessors,
        executor.X_trains,
        executor.y_trains,
        executor.ensemble_configs,
        executor.cat_ixs,
    )
    # Training data and categorical indices are zipped along but not needed
    # for the transform itself.
    for member_preprocessor, _X_train, _y_train, member_config, _cat_ix in members:
        transformed = member_preprocessor.transform(X).X
        yield (
            torch.as_tensor(transformed, dtype=torch.float32, device=device),
            member_config,
        )

In [None]:
# Use a dedicated loop name: reusing `config` here would overwrite the
# ModelInterfaceConfig that the classifier-construction cell passes as
# `inference_config`, so re-running that cell would receive an EnsembleConfig.
for output, ensemble_config in custom_iter_outputs(
    reg.executor_,
    X,
    device=reg.device_,
    autocast=reg.use_autocast_,
):
    # Show the preprocessed tensor, its ensemble configuration and its shape.
    print(output, ensemble_config)
    print(output.shape)

In [None]:
# Show the feature frame in its current state (after the validation and
# dtype-fixing cells above have overwritten `x`).
print(x)

In [None]:
# `output` is a torch tensor created on `reg.device_`; move it to host memory
# first, because pandas cannot build a frame from a CUDA tensor.
print(pd.DataFrame(output.detach().cpu().numpy()))

In [None]:
# The executor exposes a list `preprocessors` (zipped with the ensemble configs
# in custom_iter_outputs), not a single `preprocessor`. Take the first entry
# and unwrap `.X` so `tran` is the transformed array that the column indexing
# below expects.
tran = reg.executor_.preprocessors[0].transform(X).X

In [None]:
# Display the features produced by the first ensemble member's preprocessor.
pd.DataFrame(reg.executor_.preprocessors[0].transform(X).X)

In [None]:
# Inspect the configuration objects of the fitted ensemble.
reg.executor_.ensemble_configs

In [None]:
# Walk over the fitted transformers of the estimator's column preprocessor and
# report, for each one, its name, the columns it handles and the transformer.
for name, transformer, columns in reg.preprocessor_.transformers_:
    print(
        f"Transformer: {name}",
        f"Applies to columns: {columns}",
        f"Transformer steps: {transformer}",
        "-" * 40,
        sep="\n",
    )

In [None]:
# Concatenate the column indices of all fitted transformers, in transformer
# order, into one flat list.
shuffle_idx = [
    idx
    for _, _, cols in reg.preprocessor_.transformers_
    for idx in cols
]

In [None]:
# Positions that would sort `shuffle_idx` ascending. If `shuffle_idx` is a
# permutation of the column indices, this is its inverse permutation.
sort_idx = np.argsort(shuffle_idx)

In [None]:
# Reorder the columns of the transformed array with `sort_idx`
# (presumably restoring the original feature order; verify against `x`).
x_in = tran[:, sort_idx]

In [None]:
# Display the column-reordered transformed features.
pd.DataFrame(x_in)

In [None]:
# Display the loaded dataset (feature and target columns) for comparison.
data

In [None]:
# Display the feature frame as left by the validation / dtype-fixing cells.
x

In [None]:
# Display the transformed features before the columns are reordered.
pd.DataFrame(tran)