In [None]:
import sys
import os

# Absolute location of the `src` folder, resolved against the current working
# directory.
src_path = os.path.abspath(os.path.join(os.getcwd(), 'src'))

# Put that folder at the front of the module search path, but only when it is
# missing, so re-running this cell never stacks up duplicate entries.
if sys.path.count(src_path) == 0:
    sys.path.insert(0, src_path)

In [None]:
import typing
import torch
from typing import Iterator
import re
import numpy as np
import pandas as pd

import os
import random

from tabpfn_extensions import TabPFNRegressor, TabPFNClassifier
from tabpfn_extensions.embedding import TabPFNEmbedding
from tabpfn.constants import (
    NA_PLACEHOLDER,
)

# import to figure out what is going on with the data
from tabpfn.utils import validate_X_predict, _fix_dtypes, _process_text_na_dataframe

from tabpfn.config import ModelInterfaceConfig
from tabpfn.preprocessing import EnsembleConfig

from src.data_constants import *
from src.data_utils import load_data_for_tabPFN


In [None]:
# Seed every random number generator this notebook imports so that a fresh
# "Restart & Run All" starts from the same state each time.
s = 42
np.random.seed(s)
random.seed(s)
torch.manual_seed(s)  # torch is imported by this notebook as well, so seed its global RNG too

# Load the dataset. Only CSV input is handled; anything else fails loudly here
# instead of leaving `data` / `ids` undefined for the cells that follow.
f = 'data/processed/vitals_train_data.csv'
if f.endswith(".csv"):
    data, ids = load_data_for_tabPFN(f)
    print(f"Loaded data from {f}")
else:
    raise ValueError(f"Unsupported data file (expected a .csv): {f}")

In [None]:
# Inference-time overrides: both shift methods are set to None and the
# fingerprint feature is switched off.
# NOTE(review): presumably this keeps the preprocessed columns easy to map back
# to the input columns for the inspection below — confirm against the TabPFN docs.
inference_overrides = {
    "FEATURE_SHIFT_METHOD": None,
    "CLASS_SHIFT_METHOD": None,
    "FINGERPRINT_FEATURE": False,
}
config = ModelInterfaceConfig(**inference_overrides)

In [None]:
# Translate the categorical column names into positional indices within
# `data`, skipping any name that is not a column of `data`.
categorical_features_ = [
    data.columns.get_loc(name)
    for name in categorical_features
    if name in data.columns
]

In [None]:
# Display the positional indices computed for the categorical columns.
categorical_features_

In [None]:
# Regressor seeded with `s`, told which column positions are categorical, and
# given the inference overrides built in `config`.
# NOTE(review): n_estimators=1 presumably leaves a single preprocessing
# pipeline to inspect — confirm against the TabPFN docs.
reg = TabPFNRegressor(
    n_estimators=1,
    random_state=s,
    inference_config=config,
    categorical_features_indices=categorical_features_,
)

In [None]:
# Fit on the whole frame against a placeholder target made of the row
# positions 0..n_rows-1.
# NOTE(review): the target carries no signal, so presumably only the fitted
# preprocessing is of interest here, not the predictions — confirm.
dummy_target = np.arange(len(data))
reg.fit(data, dummy_target)

In [None]:
# Bind a second name to the loaded data. This is an alias, not a copy: `x` and
# `data` refer to the same object.
x = data

In [None]:
# Show the dtypes of `x` before any of the TabPFN helper calls below touch it.
x.dtypes

In [None]:
# First of three TabPFN helper calls applied in sequence: pass `x` and the
# fitted regressor to validate_X_predict and keep the result as `X`.
# NOTE(review): presumably this mirrors what the regressor does internally at
# predict time — confirm against the TabPFN source.
X = validate_X_predict(x, reg)

In [None]:
# Second helper call: _fix_dtypes with the regressor's categorical feature
# indices. `X` is rebound to the result, so re-running this cell on its own
# feeds the helper whatever `X` currently holds rather than the validated input.
X = _fix_dtypes(X, cat_indices=reg.categorical_features_indices)

In [None]:
# Third helper call: _process_text_na_dataframe, using the regressor's
# `preprocessor_` attribute as the ordinal encoder. `X` is rebound again here.
X = _process_text_na_dataframe(X, ord_encoder=reg.preprocessor_)

In [None]:
# Shape of `X` after the three helper calls above.
X.shape

In [None]:
# Report the first column of `x` whose values are all missing. The selection
# can be empty, in which case indexing element 0 would raise IndexError, so
# that case is reported explicitly instead.
all_nan_columns = x.columns[x.isna().all(axis=0)]
if len(all_nan_columns) > 0:
    print(all_nan_columns.values[0])
else:
    print("No column of x is entirely NaN")

In [None]:
# Show the ensemble configurations held by the regressor's executor.
reg.executor_.ensemble_configs

In [None]:
# Keep a handle on the first entry of the executor's preprocessors. The cells
# below index into it by position and call `.transform` on its elements.
pipeline = reg.executor_.preprocessors[0]

In [None]:
# Display the pipeline object itself.
pipeline

In [None]:
# Call get_column_types(X) on the pipeline step at index 1 and display the result.
pipeline[1].get_column_types(X)

In [None]:
# Look up the "adaptive" entry in what get_adaptive_preprocessors() returns for step 1.
pipeline[1].get_adaptive_preprocessors()["adaptive"]

In [None]:
# Look up the "safepower" entry in what get_all_preprocessors(100) returns for step 1.
# NOTE(review): the meaning of the argument 100 is not visible in this
# notebook — confirm against the TabPFN source.
pipeline[1].get_all_preprocessors(100)["safepower"]

In [None]:
# Show the `subsample_features` attribute of the pipeline step at index 1.
pipeline[1].subsample_features

In [None]:
# Show the `apply_to_categorical` attribute of the pipeline step at index 1.
pipeline[1].apply_to_categorical

In [None]:
# Show the `categorical_transformer_` attribute of the pipeline step at index 2.
pipeline[2].categorical_transformer_

In [None]:
# Wrap `X` in a DataFrame so it is rendered as a table.
pd.DataFrame(X)

In [None]:
# Take a copy of `X` so the per-step experiments below are handed their own
# object rather than `X` itself.
inspecting = X.copy()

In [None]:
# Apply only the step at index 0 to `inspecting` and tabulate the `.X` field
# of its result.
pd.DataFrame(pipeline[0].transform(inspecting).X)

In [None]:
# Apply only the step at index 1. It receives `inspecting` directly, not the
# output of step 0, so this shows the step in isolation rather than chained.
pd.DataFrame(pipeline[1].transform(inspecting).X)

In [None]:
# Apply only the step at index 2. As with step 1, the input is `inspecting`
# itself, not the output of the earlier steps.
pd.DataFrame(pipeline[2].transform(inspecting).X)

In [None]:
# Run the first preprocessor entry as a whole on `X` (the same object the
# `pipeline` name was bound to earlier) and tabulate the `.X` field of the result.
pd.DataFrame(reg.executor_.preprocessors[0].transform(X).X)

In [None]:
# List the attribute and method names available on the pipeline step at index 3.
print(dir(pipeline[3]))

In [None]:
# Print the `shuffle_method` attribute of the pipeline step at index 3.
print(pipeline[3].shuffle_method)

In [None]:
# Concatenate, in order, the column selections stored as the third element of
# every entry in the regressor preprocessor's `transformers_` list.
# NOTE(review): presumably these are the original column positions in the
# order the encoder emits them — confirm.
shuffle_idx = [
    column
    for _, _, columns in reg.preprocessor_.transformers_
    for column in columns
]

In [None]:
# Positions that would sort `shuffle_idx` in ascending order; indexing with
# them rearranges items back into ascending order of their `shuffle_idx` value.
sort_idx = np.argsort(shuffle_idx)

In [None]:
# Output of the first preprocessor entry run as a whole on `X`.
# NOTE(review): no cell in this notebook assigns `tran`, so on a fresh kernel
# this cell depended on hidden state and raised NameError. Binding it to the
# full pipeline output is the most plausible intended value — confirm.
tran = reg.executor_.preprocessors[0].transform(X).X

# Reorder the columns of the transformed array with `sort_idx`.
x_in = tran[:, sort_idx]

In [None]:
# Tabulate the column-reordered array.
pd.DataFrame(x_in)

In [None]:
# Display the loaded data frame.
data

In [None]:
# Display `x`, which was bound to the same object as `data` earlier in the notebook.
x

In [None]:
# Tabulate `tran` (it must already be bound to a 2-D array by this point).
pd.DataFrame(tran)