In [1]:
#pip install seaborn
import warnings
warnings.filterwarnings('ignore')

from pytorch_tabular import TabularModel
from pytorch_tabular.models import GatedAdditiveTreeEnsembleConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    #ExperimentConfig,
)
from pytorch_tabular.utils import get_class_weighted_cross_entropy
#pip install pytorch_tabular[extra]
from evaluation.generalevaluator import *
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, fetch_california_housing, load_breast_cancer
from factory import create_data_loader
import pandas as pd

import torch.nn as nn
import wandb

In [2]:
help(TrainerConfig)

Help on class TrainerConfig in module pytorch_tabular.config.config:

class TrainerConfig(builtins.object)
 |  TrainerConfig(batch_size: int = 64, data_aware_init_batch_size: int = 2000, fast_dev_run: bool = False, max_epochs: int = 10, min_epochs: Optional[int] = 1, max_time: Optional[int] = None, gpus: Optional[int] = None, accelerator: Optional[str] = 'auto', devices: Optional[int] = None, devices_list: Optional[List[int]] = None, accumulate_grad_batches: int = 1, auto_lr_find: bool = False, auto_select_gpus: bool = True, check_val_every_n_epoch: int = 1, gradient_clip_val: float = 0.0, overfit_batches: float = 0.0, deterministic: bool = False, profiler: Optional[str] = None, early_stopping: Optional[str] = 'valid_loss', early_stopping_min_delta: float = 0.001, early_stopping_mode: str = 'min', early_stopping_patience: int = 3, early_stopping_kwargs: Optional[Dict[str, Any]] = <factory>, checkpoints: Optional[str] = 'valid_loss', checkpoints_path: str = 'saved_models', checkpoints_e

In [2]:
data_loader = create_data_loader('iris', test_size=0.2, normalize_features = "mean_std", return_extra_info = True)
X_train, X_val, y_train, y_val, extra_info = data_loader.load_data()

In [4]:
X_train.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
22,-1.473937,1.203658,-1.562535,-1.187793
15,-0.133071,2.992376,-1.276006,-1.187793
65,1.085898,0.085709,0.385858,0.627942


# GATE Tests

In [None]:
help(GatedAdditiveTreeEnsembleConfig)

In [None]:
help(OptimizerConfig)

In [11]:
import torch.optim as optim
data_config = DataConfig(
    target=['target'],
    continuous_cols= [i for i in extra_info["num_col_names"] if i != "target"],
    categorical_cols=extra_info["cat_col_names"],
    #num_workers = 4
)
trainer_config = TrainerConfig(
    auto_lr_find=False, # Runs the LRFinder to automatically derive a learning rate
    batch_size=16,
    max_epochs=300,
    early_stopping="valid_loss", # Monitor valid_loss for early stopping
    early_stopping_mode = "min", # Set the mode as min because for val_loss, lower is better
    early_stopping_patience=20, # No. of epochs of degradation training will wait before terminating
    checkpoints="valid_loss", # Save best checkpoint monitoring val_loss
    load_best=True, # After training, load the best checkpoint
)

optimizer_config = OptimizerConfig(
    optimizer="Adam",
    optimizer_params={
        "weight_decay": 0.001
    },
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={
        "mode": "min",
        "factor": 0.1,
        "patience": 5,
        "verbose": True
    },
    lr_scheduler_monitor_metric="valid_loss"
)

model_config = GatedAdditiveTreeEnsembleConfig(
    task="classification",
    tree_depth  =  5,
    num_trees   =  12,
    chain_trees = False, # akin to bagging, True is akin to boosting
    gflu_stages =  2
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

2023-07-13 18:05:16,439 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off


In [16]:
from sklearn.utils.class_weight import compute_class_weight

In [21]:
np.unique(y_train.values)

array([0, 1, 2])

In [22]:
help(compute_class_weight)

Help on function compute_class_weight in module sklearn.utils.class_weight:

compute_class_weight(class_weight, *, classes, y)
    Estimate class weights for unbalanced datasets.
    
    Parameters
    ----------
    class_weight : dict, 'balanced' or None
        If 'balanced', class weights will be given by
        ``n_samples / (n_classes * np.bincount(y))``.
        If a dictionary is given, keys are classes and values
        are corresponding class weights.
        If None is given, the class weights will be uniform.
    
    classes : ndarray
        Array of the classes occurring in the data, as given by
        ``np.unique(y_org)`` with ``y_org`` the original class labels.
    
    y : array-like of shape (n_samples,)
        Array of original class labels per sample.
    
    Returns
    -------
    class_weight_vect : ndarray of shape (n_classes,)
        Array with class_weight_vect[i] the weight for i-th class.
    
    References
    ----------
    The "balanced" heurist

In [20]:

class_weights= compute_class_weight('balanced',np.unique(y_train.values),y_train.values)
class_weights=torch.tensor(class_weights,dtype=torch.float)
 
print(class_weights) #([1.0000, 1.0000, 4.0000, 1.0000, 0.5714])
#Then pass it to nn.CrossEntropyLoss's weight variable

weighted_loss = nn.CrossEntropyLoss(weight=class_weights,reduction='mean')

TypeError: compute_class_weight() takes 1 positional argument but 3 were given

In [12]:
# Calculate the weighted loss
weighted_loss = get_class_weighted_cross_entropy(y_train.values.ravel(), mu =1.4)



In [13]:
assert X_train.isnull().sum().sum() == 0
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Merge X_train and y_train  
train = pd.concat([X_train, y_train], axis=1)

# Merge X_val and y_val
validation = pd.concat([X_val, y_val], axis=1)

tabular_model.fit(
    train=train, 
    validation=validation,
    loss=weighted_loss
)

2023-07-13 18:05:17,886 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off
Global seed set to 42
2023-07-13 18:05:17,913 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
2023-07-13 18:05:17,916 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for classification task
2023-07-13 18:05:17,941 - {pytorch_tabular.tabular_model:508} - INFO - Preparing the Model: GatedAdditiveTreeEnsembleModel
2023-07-13 18:05:18,073 - {pytorch_tabular.tabular_model:264} - INFO - Preparing the Trainer
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-07-13 18:05:18,142 - {pytorch_tabular.tabular_model:566} - INFO - Training Started


Output()

2023-07-13 18:13:07,327 - {pytorch_tabular.tabular_model:568} - INFO - Training the model completed
2023-07-13 18:13:07,328 - {pytorch_tabular.tabular_model:1207} - INFO - Loading the best model


<pytorch_lightning.trainer.trainer.Trainer at 0x15d18c850>

In [14]:
y_true = validation["target"]
y_pred = tabular_model.predict(validation)["prediction"]
evaluator = Evaluator(
                y_true=y_true,
                y_pred=y_pred,
                run_metrics=["mse", "f1", "accuracy"],
                metric="mse",
                problem_type="multiclass_classification",
            )
output_metrics = evaluator.evaluate_model()

Output()

In [15]:
output_metrics

{'mse': 0.0, 'accuracy': 1.0, 'f1': 1.0}