In [100]:
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from evaluation.generalevaluator import *
from factory import create_data_loader
import pandas as pd
from tqdm import tqdm
#pip install tabpfn
from tabpfn import TabPFNClassifier

In [113]:
# Build the loader for the 'ageconditions' dataset and split it into train / hold-out parts.
# NOTE(review): normalize_features is passed as the *string* "false", which is truthy
# in Python — confirm that the loader parses strings rather than expecting a bool.
data_loader = create_data_loader(
    'ageconditions',
    test_size=0.01,
    normalize_features="false",
    return_extra_info=True,
    encode_categorical=True,
)
(X_train, X_test, y_train, y_test, extra_info) = data_loader.load_data()

0 Index([], dtype='object')


In [114]:
# Load the Kaggle test features that the submission is generated for.
TEST_CSV_PATH = "./data/kaggle/icr-identify-age-related-conditions/test.csv"
test_raw = pd.read_csv(TEST_CSV_PATH)

# Keep the row identifiers aside; they are not passed to the model.
ids = test_raw["Id"]

df = (
    test_raw
    .drop(columns=["Id"])
    # rename() ignores missing labels by default, so this is a no-op if the
    # file has no "Class" column.
    .rename(columns={"Class": "target"})
    # Encode the categorical EJ column as 0/1; values other than "A"/"B" become NaN.
    .assign(EJ=lambda frame: frame["EJ"].map({"A": 1, "B": 0}))
)

# Names of the columns that contain at least one missing value.
cols_with_nans = df.columns[df.isna().any()].tolist()


In [115]:
# Preview the first rows of the preprocessed test features.
df.head()

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
# N_ensemble_configurations sets how many model predictions, taken over feature
# and class rotations, are ensembled (see the TabPFN documentation for details).
# Once N_ensemble_configurations exceeds #features * #classes, no further
# averaging is applied.
classifier = TabPFNClassifier(
    N_ensemble_configurations=32,
    device='cpu',
)


Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [117]:
# Fit TabPFN on the training split, then predict the hold-out split.
# NOTE(review): with return_winning_probability=True, p_eval appears to be the
# probability of the *predicted* class for each row, not P(class 1) — confirm
# against the TabPFN API before using it as a class-specific score.
classifier.fit(X_train, y_train)
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)

In [118]:
# Hold-out metrics for the fitted classifier (binary target).
# The hold-out split was created with test_size=0.01, so it is very small and
# these numbers are only a rough sanity check.
print('Accuracy', accuracy_score(y_test, y_eval))
print('F1', f1_score(y_test, y_eval))

# roc_auc_score needs a score for the positive class on every row.
# `1 - p_eval` is not that: p_eval is the probability of whichever class won,
# so the expression is only P(class 1) for rows predicted as class 0.
# Take the positive-class column of predict_proba instead.
p_positive = classifier.predict_proba(X_test)[:, 1]
print('AUC', roc_auc_score(y_test, p_positive))

Accuracy 1.0
F1 1.0
AUC 1.0


In [119]:
# Predict the Kaggle test rows.
# predict(..., return_winning_probability=True) returns the probability of the
# *predicted* class, which equals P(class 0) only for rows predicted as class 0.
# p_eval is later written out as the "Class_0" column, so derive it from
# predict_proba: column 0 is always the probability of classifier.classes_[0].
proba = classifier.predict_proba(df)
y_eval = classifier.classes_[proba.argmax(axis=1)]  # same labels predict() would return
p_eval = proba[:, 0]

In [120]:
# Inspect the probabilities computed for the Kaggle test rows.
p_eval

array([0.6682701, 0.6682701, 0.6682701, 0.6682701, 0.6682701],
      dtype=float32)

In [121]:
# Put the row ids next to the predicted probabilities, aligned on the row index.
class0_frame = pd.DataFrame({"Class_0": p_eval})
subs = pd.concat([ids, class0_frame], axis=1)

In [122]:
# The two class probabilities sum to one, so Class_1 is the complement of Class_0.
subs["Class_1"] = subs["Class_0"].rsub(1)

In [123]:
# Display the assembled submission table (Id, Class_0, Class_1).
subs

Unnamed: 0,Id,Class_0,Class_1
0,00eed32682bb,0.66827,0.33173
1,010ebe33f668,0.66827,0.33173
2,02fa521e1838,0.66827,0.33173
3,040e15f562a2,0.66827,0.33173
4,046e85c7cc7f,0.66827,0.33173


In [124]:
# Write the submission file. DataFrame.to_csv does not create missing parent
# directories, so make sure ./output exists first (e.g. on a fresh checkout).
from pathlib import Path

submission_path = Path("./output/ageconditions_submission.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
subs.to_csv(submission_path, index=False)