In [25]:
import torch
from torch.nn import Linear, BatchNorm1d, ReLU, GELU, Sigmoid, SiLU, LeakyReLU
import numpy as np
from pytorch_tabnet import sparsemax
# from google.colab import drive
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import r2_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random
# from pytorchtools import EarlyStopping
from entmax import entmax_bisect
from functools import partial



In [26]:
data = pd.read_excel('cleaned_data.xlsx')
print(data.head())

   Unnamed: 0  backers_count  \
0           1            403   
1           2              2   
2           3            406   
3           4              1   
4           5             67   

                                               blurb  \
0  The true biography of the historical figure, w...   
1  FAM is the new mobile app which combines event...   
2  A graphic novel about two magical ladies in love.   
3  We are publishing a magazine that focuses on t...   
4  A dark and magical film set in a brothel, an u...   

   converted_pledged_amount country country_displayable_name  \
0                     14740      US        the United States   
1                        14      GB       the United Kingdom   
2                     21799      US        the United States   
3                        10      US        the United States   
4                      8175      AU                Australia   

           created_at currency current_currency            deadline  ...  \
0 2015-08

In [27]:
def initialize_non_glu(module, input_dim, output_dim):
    gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(4 * input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)
    # torch.nn.init.zeros_(module.bias)
    return

In [28]:
def initialize_glu(module, input_dim, output_dim):
    gain_value = np.sqrt((input_dim + output_dim) / np.sqrt(input_dim))
    torch.nn.init.xavier_normal_(module.weight, gain=gain_value)
    # torch.nn.init.zeros_(module.bias)
    return

In [29]:
class GBN(torch.nn.Module):
    """
    Ghost Batch Normalization
    https://arxiv.org/abs/1705.08741
    """

    def __init__(self, input_dim, virtual_batch_size=128, momentum=0.01):
        super(GBN, self).__init__()

        self.input_dim = input_dim
        self.virtual_batch_size = virtual_batch_size
        self.bn = BatchNorm1d(self.input_dim, momentum=momentum)

    def forward(self, x):
        chunks = x.chunk(int(np.ceil(x.shape[0] / self.virtual_batch_size)), 0)
        res = [self.bn(x_) for x_ in chunks]

        return torch.cat(res, dim=0)

In [30]:
class TabNetEncoder(torch.nn.Module):
    def __init__(
        self,
        input_dim,
        output_dim,
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2,
        epsilon=1e-15,
        virtual_batch_size=128,
        momentum=0.02,
        mask_type="sparsemax",
        group_attention_matrix=None,
    ):
            super(TabNetEncoder, self).__init__()
            self.input_dim = input_dim
            self.output_dim = output_dim
            self.is_multi_task = isinstance(output_dim, list)
            self.n_d = n_d
            self.n_a = n_a
            self.n_steps = n_steps
            self.gamma = gamma
            self.epsilon = epsilon
            self.n_independent = n_independent
            self.n_shared = n_shared
            self.virtual_batch_size = virtual_batch_size
            self.mask_type = mask_type
            self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01)
            self.group_attention_matrix = group_attention_matrix

            if self.group_attention_matrix is None:
                # no groups
                self.group_attention_matrix = torch.eye(self.input_dim)
                self.attention_dim = self.input_dim
            else:
                self.attention_dim = self.group_attention_matrix.shape[0]

            if self.n_shared > 0:
                shared_feat_transform = torch.nn.ModuleList()
                for i in range(self.n_shared):
                    if i == 0:
                        shared_feat_transform.append(
                            Linear(self.input_dim, 2 * (n_d + n_a), bias=False)
                        )
                    else:
                        shared_feat_transform.append(
                            Linear(n_d + n_a, 2 * (n_d + n_a), bias=False)
                        )

            else:
                shared_feat_transform = None

            self.initial_splitter = FeatTransformer(
                self.input_dim,
                n_d + n_a,
                shared_feat_transform,
                n_glu_independent=self.n_independent,
                virtual_batch_size=self.virtual_batch_size,
                momentum=momentum,
            )

            self.feat_transformers = torch.nn.ModuleList()
            self.att_transformers = torch.nn.ModuleList()

            for step in range(n_steps):
                transformer = FeatTransformer(
                    self.input_dim,
                    n_d + n_a,
                    shared_feat_transform,
                    n_glu_independent=self.n_independent,
                    virtual_batch_size=self.virtual_batch_size,
                    momentum=momentum,
                )
                attention = AttentiveTransformer(
                    n_a,
                    self.attention_dim,
                    group_matrix=group_attention_matrix,
                    virtual_batch_size=self.virtual_batch_size,
                    momentum=momentum,
                    mask_type=self.mask_type,
                )
                self.feat_transformers.append(transformer)
                self.att_transformers.append(attention)

    def forward(self, x, prior=None):
        x = self.initial_bn(x)

        bs = x.shape[0]  # batch size
        if prior is None:
            prior = torch.ones((bs, self.attention_dim)).to(x.device)

        M_loss = 0
        att = self.initial_splitter(x)[:, self.n_d :]
        steps_output = []
        for step in range(self.n_steps):
            M = self.att_transformers[step](prior, att)
            M_loss += torch.mean(
                torch.sum(torch.mul(M, torch.log(M + self.epsilon)), dim=1)
            )
            # update prior
            prior = torch.mul(self.gamma - M, prior)
            # output
            M_feature_level = torch.matmul(M, self.group_attention_matrix)
            masked_x = torch.mul(M_feature_level, x)
            out = self.feat_transformers[step](masked_x)
            d = ReLU()(out[:, : self.n_d])
            steps_output.append(d)
            # update attention
            att = out[:, self.n_d :]

        M_loss /= self.n_steps
        return steps_output, M_loss

    def forward_masks(self, x):
        x = self.initial_bn(x)
        bs = x.shape[0]  # batch size
        prior = torch.ones((bs, self.attention_dim)).to(x.device)
        M_explain = torch.zeros(x.shape).to(x.device)
        att = self.initial_splitter(x)[:, self.n_d :]
        masks = {}

        for step in range(self.n_steps):
            M = self.att_transformers[step](prior, att)
            M_feature_level = torch.matmul(M, self.group_attention_matrix)
            masks[step] = M_feature_level
            # update prior
            prior = torch.mul(self.gamma - M, prior)
            # output
            masked_x = torch.mul(M_feature_level, x)
            out = self.feat_transformers[step](masked_x)
            d = ReLU()(out[:, : self.n_d])
            # explain
            step_importance = torch.sum(d, dim=1)
            M_explain += torch.mul(M_feature_level, step_importance.unsqueeze(dim=1))
            # update attention
            att = out[:, self.n_d :]

        return M_explain, masks

In [31]:
# class TabNetDecoder(torch.nn.Module):
#     def __init__(
#         self,
#         input_dim,
#         n_d=8,
#         n_steps=3,
#         n_independent=1,
#         n_shared=1,
#         virtual_batch_size=128,
#         momentum=0.02,
#     ):
#             super(TabNetDecoder, self).__init__()
#             self.input_dim = input_dim
#             self.n_d = n_d
#             self.n_steps = n_steps
#             self.n_independent = n_independent
#             self.n_shared = n_shared
#             self.virtual_batch_size = virtual_batch_size

#             self.feat_transformers = torch.nn.ModuleList()

#             if self.n_shared > 0:
#                 shared_feat_transform = torch.nn.ModuleList()
#                 for i in range(self.n_shared):
#                     shared_feat_transform.append(Linear(n_d, 2 * n_d, bias=False))
#             else:
#                 shared_feat_transform = None

#             for step in range(n_steps):
#                 transformer = FeatTransformer(
#                     n_d,
#                     n_d,
#                     shared_feat_transform,
#                     n_glu_independent=self.n_independent,
#                     virtual_batch_size=self.virtual_batch_size,
#                     momentum=momentum,
#                 )
#                 self.feat_transformers.append(transformer)

#             self.reconstruction_layer = Linear(n_d, self.input_dim, bias=False)
#             initialize_non_glu(self.reconstruction_layer, n_d, self.input_dim)

#     def forward(self, steps_output):
#         res = 0
#         for step_nb, step_output in enumerate(steps_output):
#             x = self.feat_transformers[step_nb](step_output)
#             res = torch.add(res, x)
#         res = self.reconstruction_layer(res)
#         return res


In [32]:
# class TabNetPretraining(torch.nn.Module):
#     def __init__(
#         self,
#         input_dim,
#         pretraining_ratio=0.2,
#         n_d=8,
#         n_a=8,
#         n_steps=3,
#         gamma=1.3,
#         cat_idxs=[],
#         cat_dims=[],
#         cat_emb_dim=1,
#         n_independent=2,
#         n_shared=2,
#         epsilon=1e-15,
#         virtual_batch_size=128,
#         momentum=0.02,
#         mask_type="sparsemax",
#         n_shared_decoder=1,
#         n_indep_decoder=1,
#         group_attention_matrix=None,
#     ):
#         super(TabNetPretraining, self).__init__()

#         self.cat_idxs = cat_idxs or []
#         self.cat_dims = cat_dims or []
#         self.cat_emb_dim = cat_emb_dim

#         self.input_dim = input_dim
#         self.n_d = n_d
#         self.n_a = n_a
#         self.n_steps = n_steps
#         self.gamma = gamma
#         self.epsilon = epsilon
#         self.n_independent = n_independent
#         self.n_shared = n_shared
#         self.mask_type = mask_type
#         self.pretraining_ratio = pretraining_ratio
#         self.n_shared_decoder = n_shared_decoder
#         self.n_indep_decoder = n_indep_decoder

#         if self.n_steps <= 0:
#             raise ValueError("n_steps should be a positive integer.")
#         if self.n_independent == 0 and self.n_shared == 0:
#             raise ValueError("n_shared and n_independent can't be both zero.")

#         self.virtual_batch_size = virtual_batch_size
#         self.embedder = EmbeddingGenerator(input_dim,
#                                            cat_dims,
#                                            cat_idxs,
#                                            cat_emb_dim,
#                                            group_attention_matrix)
#         self.post_embed_dim = self.embedder.post_embed_dim

#         self.masker = RandomObfuscator(self.pretraining_ratio,
#                                        group_matrix=self.embedder.embedding_group_matrix)
#         self.encoder = TabNetEncoder(
#             input_dim=self.post_embed_dim,
#             output_dim=self.post_embed_dim,
#             n_d=n_d,
#             n_a=n_a,
#             n_steps=n_steps,
#             gamma=gamma,
#             n_independent=n_independent,
#             n_shared=n_shared,
#             epsilon=epsilon,
#             virtual_batch_size=virtual_batch_size,
#             momentum=momentum,
#             mask_type=mask_type,
#             group_attention_matrix=self.embedder.embedding_group_matrix,
#         )
#         self.decoder = TabNetDecoder(
#             self.post_embed_dim,
#             n_d=n_d,
#             n_steps=n_steps,
#             n_independent=self.n_indep_decoder,
#             n_shared=self.n_shared_decoder,
#             virtual_batch_size=virtual_batch_size,
#             momentum=momentum,
#         )

#     def forward(self, x):
#         """
#         Returns: res, embedded_x, obf_vars
#             res : output of reconstruction
#             embedded_x : embedded input
#             obf_vars : which variable where obfuscated
#         """
#         embedded_x = self.embedder(x)
#         if self.training:
#             masked_x, obfuscated_groups, obfuscated_vars = self.masker(embedded_x)
#             # set prior of encoder with obfuscated groups
#             prior = 1 - obfuscated_groups
#             steps_out, _ = self.encoder(masked_x, prior=prior)
#             res = self.decoder(steps_out)
#             return res, embedded_x, obfuscated_vars
#         else:
#             steps_out, _ = self.encoder(embedded_x)
#             res = self.decoder(steps_out)
#             return res, embedded_x, torch.ones(embedded_x.shape).to(x.device)

#     def forward_masks(self, x):
#         embedded_x = self.embedder(x)
#         return self.encoder.forward_masks(embedded_x)


In [33]:
class TabNetNoEmbeddings(torch.nn.Module):
    def __init__(
        self,
        input_dim,
        output_dim,
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.3,
        n_independent=2,
        n_shared=2,
        epsilon=1e-15,
        virtual_batch_size=128,
        momentum=0.02,
        mask_type="sparsemax",
        group_attention_matrix=None,
    ):
        """
        Defines main part of the TabNet network without the embedding layers.

        Parameters
        ----------
        input_dim : int
            Number of features
        output_dim : int or list of int for multi task classification
            Dimension of network output
            examples : one for regression, 2 for binary classification etc...
        n_d : int
            Dimension of the prediction  layer (usually between 4 and 64)
        n_a : int
            Dimension of the attention  layer (usually between 4 and 64)
        n_steps : int
            Number of successive steps in the network (usually between 3 and 10)
        gamma : float
            Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0)
        n_independent : int
            Number of independent GLU layer in each GLU block (default 2)
        n_shared : int
            Number of independent GLU layer in each GLU block (default 2)
        epsilon : float
            Avoid log(0), this should be kept very low
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization
        momentum : float
            Float value between 0 and 1 which will be used for momentum in all batch norm
        mask_type : str
            Either "sparsemax" or "entmax" : this is the masking function to use
        group_attention_matrix : torch matrix
            Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j
        """
        super(TabNetNoEmbeddings, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.is_multi_task = isinstance(output_dim, list)
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.virtual_batch_size = virtual_batch_size
        self.mask_type = mask_type
        self.initial_bn = BatchNorm1d(self.input_dim, momentum=0.01)

        self.encoder = TabNetEncoder(
            input_dim=input_dim,
            output_dim=output_dim,
            n_d=n_d,
            n_a=n_a,
            n_steps=n_steps,
            gamma=gamma,
            n_independent=n_independent,
            n_shared=n_shared,
            epsilon=epsilon,
            virtual_batch_size=virtual_batch_size,
            momentum=momentum,
            mask_type=mask_type,
            group_attention_matrix=group_attention_matrix
        )

        if self.is_multi_task:
            self.multi_task_mappings = torch.nn.ModuleList()
            for task_dim in output_dim:
                task_mapping = Linear(n_d, task_dim, bias=False)
                initialize_non_glu(task_mapping, n_d, task_dim)
                self.multi_task_mappings.append(task_mapping)
        else:
            self.final_mapping = Linear(n_d, output_dim, bias=False)
            initialize_non_glu(self.final_mapping, n_d, output_dim)

    def forward(self, x):
        res = 0
        steps_output, M_loss = self.encoder(x)
        res = torch.sum(torch.stack(steps_output, dim=0), dim=0)

        if self.is_multi_task:
            # Result will be in list format
            out = []
            for task_mapping in self.multi_task_mappings:
                out.append(task_mapping(res))
        else:
            out = self.final_mapping(res)
        return out, M_loss

    def forward_masks(self, x):
        return self.encoder.forward_masks(x)

In [34]:
class TabNet(torch.nn.Module):
    def __init__(
        self,
        input_dim,
        output_dim,
        n_d=8,
        n_a=8,
        n_steps=10,
        gamma=1.5,
        cat_idxs=[],
        cat_dims=[],
        cat_emb_dim=1,
        n_independent=2,
        n_shared=2,
        epsilon=1e-15,
        virtual_batch_size=64,
        momentum=0.02,
        mask_type="sparsemax",
        group_attention_matrix=[],
    ):
        """
        Defines TabNet network

        Parameters
        ----------
        input_dim : int
            Initial number of features
        output_dim : int
            Dimension of network output
            examples : one for regression, 2 for binary classification etc...
        n_d : int
            Dimension of the prediction  layer (usually between 4 and 64)
        n_a : int
            Dimension of the attention  layer (usually between 4 and 64)
        n_steps : int
            Number of successive steps in the network (usually between 3 and 10)
        gamma : float
            Float above 1, scaling factor for attention updates (usually between 1.0 to 2.0)
        cat_idxs : list of int
            Index of each categorical column in the dataset
        cat_dims : list of int
            Number of categories in each categorical column
        cat_emb_dim : int or list of int
            Size of the embedding of categorical features
            if int, all categorical features will have same embedding size
            if list of int, every corresponding feature will have specific size
        n_independent : int
            Number of independent GLU layer in each GLU block (default 2)
        n_shared : int
            Number of independent GLU layer in each GLU block (default 2)
        epsilon : float
            Avoid log(0), this should be kept very low
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization
        momentum : float
            Float value between 0 and 1 which will be used for momentum in all batch norm
        mask_type : str
            Either "sparsemax" or "entmax" : this is the masking function to use
        group_attention_matrix : torch matrix
            Matrix of size (n_groups, input_dim), m_ij = importance within group i of feature j
        """
        super(TabNet, self).__init__()
        self.cat_idxs = cat_idxs or []
        self.cat_dims = cat_dims or []
        self.cat_emb_dim = cat_emb_dim

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.n_d = n_d
        self.n_a = n_a
        self.n_steps = n_steps
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_independent = n_independent
        self.n_shared = n_shared
        self.mask_type = mask_type

        if self.n_steps <= 0:
            raise ValueError("n_steps should be a positive integer.")
        if self.n_independent == 0 and self.n_shared == 0:
            raise ValueError("n_shared and n_independent can't be both zero.")

        self.virtual_batch_size = virtual_batch_size
        self.embedder = EmbeddingGenerator(input_dim,
                                           cat_dims,
                                           cat_idxs,
                                           cat_emb_dim,
                                           group_attention_matrix)
        self.post_embed_dim = self.embedder.post_embed_dim

        self.tabnet = TabNetNoEmbeddings(
            self.post_embed_dim,
            output_dim,
            n_d,
            n_a,
            n_steps,
            gamma,
            n_independent,
            n_shared,
            epsilon,
            virtual_batch_size,
            momentum,
            mask_type,
            self.embedder.embedding_group_matrix
        )

    def forward(self, x):
        x = self.embedder(x)
        return self.tabnet(x)

    def forward_masks(self, x):
        x = self.embedder(x)
        return self.tabnet.forward_masks(x)

In [35]:
class AttentiveTransformer(torch.nn.Module):
    def __init__(
        self,
        input_dim,
        group_dim,
        group_matrix,
        virtual_batch_size=128,
        momentum=0.02,
        mask_type="sparsemax",
    ):
        """
        Initialize an attention transformer.

        Parameters
        ----------
        input_dim : int
            Input size
        group_dim : int
            Number of groups for features
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization
        momentum : float
            Float value between 0 and 1 which will be used for momentum in batch norm
        mask_type : str
            Either "sparsemax" or "entmax" : this is the masking function to use
        """
        super(AttentiveTransformer, self).__init__()
        self.fc = Linear(input_dim, group_dim, bias=False)
        initialize_non_glu(self.fc, input_dim, group_dim)
        self.bn = GBN(
            group_dim, virtual_batch_size=virtual_batch_size, momentum=momentum
        )

        if mask_type == "sparsemax":
            # Sparsemax
            self.selector = sparsemax.Sparsemax(dim=-1)
        elif mask_type == "entmax15":
            # Entmax
            self.selector = sparsemax.Entmax15(dim=-1)
            #new below
        elif mask_type == "entmoid15":
            # Entmoid15
            # self.selector = sparsemax.Entmoid15()
            self.selector = sparsemax.Entmoid15.apply
        elif mask_type == "alpha11entmax":
            self.selector = partial(entmax_bisect, alpha=1.1, dim=-1)
        elif mask_type == "alpha12entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.2)
            self.selector = partial(entmax_bisect, alpha=1.2, dim=-1)
        elif mask_type == "alpha13entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.3)
            self.selector = partial(entmax_bisect, alpha=1.3, dim=-1)
        elif mask_type == "alpha14entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.4)
            self.selector = partial(entmax_bisect, alpha=1.4, dim=-1)
        elif mask_type == "alpha15entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.5)
            self.selector = partial(entmax_bisect, alpha=1.5, dim=-1)
        elif mask_type == "alpha16entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.6)
            self.selector = partial(entmax_bisect, alpha=1.6, dim=-1)
        elif mask_type == "alpha17entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.7)
            self.selector = partial(entmax_bisect, alpha=1.7, dim=-1)
        elif mask_type == "alpha18entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.8)
            self.selector = partial(entmax_bisect, alpha=1.8, dim=-1)
        elif mask_type == "alpha19entmax":
            # self.selector = sparsemax.EntmaxBisect(dim=-1, alpha=1.9)
            self.selector = partial(entmax_bisect, alpha=1.9, dim=-1)
            #new up here
        else:
            raise NotImplementedError(
                "Please choose either sparsemax" + "or entmax15 as masktype"
            )

    def forward(self, priors, processed_feat):
        x = self.fc(processed_feat)
        x = self.bn(x)
        x = torch.mul(x, priors)
        x = self.selector(x)
        return x

In [36]:
class FeatTransformer(torch.nn.Module):
    def __init__(
        self,
        input_dim,
        output_dim,
        shared_layers,
        n_glu_independent,
        virtual_batch_size=128,
        momentum=0.02,
    ):
        super(FeatTransformer, self).__init__()
        """
        Initialize a feature transformer.

        Parameters
        ----------
        input_dim : int
            Input size
        output_dim : int
            Output_size
        shared_layers : torch.nn.ModuleList
            The shared block that should be common to every step
        n_glu_independent : int
            Number of independent GLU layers
        virtual_batch_size : int
            Batch size for Ghost Batch Normalization within GLU block(s)
        momentum : float
            Float value between 0 and 1 which will be used for momentum in batch norm
        """

        params = {
            "n_glu": n_glu_independent,
            "virtual_batch_size": virtual_batch_size,
            "momentum": momentum,
        }

        if shared_layers is None:
            # no shared layers
            self.shared = torch.nn.Identity()
            is_first = True
        else:
            self.shared = GLU_Block(
                input_dim,
                output_dim,
                first=True,
                shared_layers=shared_layers,
                n_glu=len(shared_layers),
                virtual_batch_size=virtual_batch_size,
                momentum=momentum,
            )
            is_first = False

        if n_glu_independent == 0:
            # no independent layers
            self.specifics = torch.nn.Identity()
        else:
            spec_input_dim = input_dim if is_first else output_dim
            self.specifics = GLU_Block(
                spec_input_dim, output_dim, first=is_first, **params
            )

    def forward(self, x):
        x = self.shared(x)
        x = self.specifics(x)
        return x

In [37]:
class GLU_Block(torch.nn.Module):
    """
    Independent GLU block, specific to each step
    """

    def __init__(
        self,
        input_dim,
        output_dim,
        n_glu=2,
        first=False,
        shared_layers=None,
        virtual_batch_size=128,
        momentum=0.02,
    ):
        super(GLU_Block, self).__init__()
        self.first = first
        self.shared_layers = shared_layers
        self.n_glu = n_glu
        self.glu_layers = torch.nn.ModuleList()

        params = {"virtual_batch_size": virtual_batch_size, "momentum": momentum}

        fc = shared_layers[0] if shared_layers else None
        self.glu_layers.append(GLU_Layer(input_dim, output_dim, fc=fc, **params))
        for glu_id in range(1, self.n_glu):
            fc = shared_layers[glu_id] if shared_layers else None
            self.glu_layers.append(GLU_Layer(output_dim, output_dim, fc=fc, **params))

    def forward(self, x):
        scale = torch.sqrt(torch.FloatTensor([0.5]).to(x.device))
        if self.first:  # the first layer of the block has no scale multiplication
            x = self.glu_layers[0](x)
            layers_left = range(1, self.n_glu)
        else:
            layers_left = range(self.n_glu)

        for glu_id in layers_left:
            x = torch.add(x, self.glu_layers[glu_id](x))
            x = x * scale
        return x

In [38]:
class GLU_Layer(torch.nn.Module):
    def __init__(
        self, input_dim, output_dim, fc=None, virtual_batch_size=128, momentum=0.02
    ):
        super(GLU_Layer, self).__init__()

        self.output_dim = output_dim
        if fc:
            self.fc = fc
        else:
            self.fc = Linear(input_dim, 2 * output_dim, bias=False)
        initialize_glu(self.fc, input_dim, 2 * output_dim)

        self.bn = GBN(
            2 * output_dim, virtual_batch_size=virtual_batch_size, momentum=momentum
        )

    def forward(self, x):
        x = self.fc(x)
        x = self.bn(x)
        out = torch.mul(x[:, : self.output_dim], torch.sigmoid(x[:, self.output_dim :]))
        return out

In [39]:
class EmbeddingGenerator(torch.nn.Module):
    """
    Classical embeddings generator
    """

    def __init__(self, input_dim, cat_dims, cat_idxs, cat_emb_dims, group_matrix):
        """This is an embedding module for an entire set of features

        Parameters
        ----------
        input_dim : int
            Number of features coming as input (number of columns)
        cat_dims : list of int
            Number of modalities for each categorial features
            If the list is empty, no embeddings will be done
        cat_idxs : list of int
            Positional index for each categorical features in inputs
        cat_emb_dim : list of int
            Embedding dimension for each categorical features
            If int, the same embedding dimension will be used for all categorical features
        group_matrix : torch matrix
            Original group matrix before embeddings
        """
        super(EmbeddingGenerator, self).__init__()

        if cat_dims == [] and cat_idxs == []:
            self.skip_embedding = True
            self.post_embed_dim = input_dim
            # self.embedding_group_matrix = group_matrix.to(group_matrix.device)
            ###################################
            if group_matrix is not None:
                self.embedding_group_matrix = group_matrix.to(group_matrix.device)
            else:
                self.embedding_group_matrix = torch.eye(input_dim)
            ###################################
            return
        else:
            self.skip_embedding = False

        self.post_embed_dim = int(input_dim + np.sum(cat_emb_dims) - len(cat_emb_dims))

        self.embeddings = torch.nn.ModuleList()

        for cat_dim, emb_dim in zip(cat_dims, cat_emb_dims):
            self.embeddings.append(torch.nn.Embedding(cat_dim, emb_dim))

        # record continuous indices
        self.continuous_idx = torch.ones(input_dim, dtype=torch.bool)
        self.continuous_idx[cat_idxs] = 0

        # update group matrix
        n_groups = group_matrix.shape[0]
        self.embedding_group_matrix = torch.empty((n_groups, self.post_embed_dim),
                                                  device=group_matrix.device)
        for group_idx in range(n_groups):
            post_emb_idx = 0
            cat_feat_counter = 0
            for init_feat_idx in range(input_dim):
                if self.continuous_idx[init_feat_idx] == 1:
                    # this means that no embedding is applied to this column
                    self.embedding_group_matrix[group_idx, post_emb_idx] = group_matrix[group_idx, init_feat_idx]  # noqa
                    post_emb_idx += 1
                else:
                    # this is a categorical feature which creates multiple embeddings
                    n_embeddings = cat_emb_dims[cat_feat_counter]
                    self.embedding_group_matrix[group_idx, post_emb_idx:post_emb_idx+n_embeddings] = group_matrix[group_idx, init_feat_idx] / n_embeddings  # noqa
                    post_emb_idx += n_embeddings
                    cat_feat_counter += 1

    def forward(self, x):
        """
        Apply embeddings to inputs
        Inputs should be (batch_size, input_dim)
        Outputs will be of size (batch_size, self.post_embed_dim)
        """
        if self.skip_embedding:
            # no embeddings required
            return x

        cols = []
        cat_feat_counter = 0
        for feat_init_idx, is_continuous in enumerate(self.continuous_idx):
            # Enumerate through continuous idx boolean mask to apply embeddings
            if is_continuous:
                cols.append(x[:, feat_init_idx].float().view(-1, 1))
            else:
                cols.append(
                    self.embeddings[cat_feat_counter](x[:, feat_init_idx].long())
                )
                cat_feat_counter += 1
        # concat
        post_embeddings = torch.cat(cols, dim=1)
        return post_embeddings

In [40]:
# class RandomObfuscator(torch.nn.Module):
#     """
#     Create and applies obfuscation masks.
#     The obfuscation is done at group level to match attention.
#     """

#     def __init__(self, pretraining_ratio, group_matrix):
#         """
#         This create random obfuscation for self suppervised pretraining
#         Parameters
#         ----------
#         pretraining_ratio : float
#             Ratio of feature to randomly discard for reconstruction

#         """
#         super(RandomObfuscator, self).__init__()
#         self.pretraining_ratio = pretraining_ratio
#         # group matrix is set to boolean here to pass all posssible information
#         self.group_matrix = (group_matrix > 0) + 0.
#         self.num_groups = group_matrix.shape[0]

#     def forward(self, x):
#         """
#         Generate random obfuscation mask.

#         Returns
#         -------
#         masked input and obfuscated variables.
#         """
#         bs = x.shape[0]

#         obfuscated_groups = torch.bernoulli(
#             self.pretraining_ratio * torch.ones((bs, self.num_groups), device=x.device)
#         )
#         obfuscated_vars = torch.matmul(obfuscated_groups, self.group_matrix)
#         masked_input = torch.mul(1 - obfuscated_vars, x)
#         return masked_input, obfuscated_groups, obfuscated_vars

In [41]:
 # Define categorical and numerical features
categorical = ["category_name", "category_slug", "country", "currency","spotlight","staff_pick"]
numerical = ['backers_count', 'converted_pledged_amount', 'fx_rate', 'goal', 'pledged', 'static_usd_rate', 'usd_pledged','days_diff_created_at_deadline','days_diff_state_changed_at_launched_at','name_len', 'blurb_len']
# valid_states = ['failed', 'successful']
# state_mapping = {state: idx for idx, state in enumerate(valid_states)}
# data["state"] = data["state"].map(state_mapping)

SEED = 42
# np.random.seed(SEED)
# torch.manual_seed(SEED)
# random.seed(SEED)

# Label encode categorical variables
label_encoders = {}
for col in categorical:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # Store encoders for later decoding

# Define targets
target_class = "state"  # Classification target
target_reg = "success_probability"  # Regression target (continuous value like pledged amount)

# Apply StandardScaler to numerical features
scaler = StandardScaler()
data[numerical] = scaler.fit_transform(data[numerical])

# Normalize the regression target values using MinMaxScaler
reg_scaler = StandardScaler()
data[target_reg] = reg_scaler.fit_transform(data[[target_reg]])


# Split data into train/test sets
train_df, test_df = train_test_split(data, test_size=0.2, random_state=SEED)

# In this example, we use all of the features (both categorical and numerical).
feature_cols = categorical + numerical

# Convert the features and target columns into tensors.
X_train = torch.tensor(train_df[feature_cols].values, dtype=torch.float32)
X_test = torch.tensor(test_df[feature_cols].values, dtype=torch.float32)

# For classification, the target is a long (integer) type.
y_train_class = torch.tensor(train_df[target_class].values, dtype=torch.long)
y_test_class = torch.tensor(test_df[target_class].values, dtype=torch.long)

# For regression, we convert the target to float.
# Even though the values are 0 or 1, treating them as floats allows you to interpret the regression output as a rate.
y_train_reg = torch.tensor(train_df[target_reg].values, dtype=torch.float32)
y_test_reg = torch.tensor(test_df[target_reg].values, dtype=torch.float32)


In [42]:
# print(train_df[target_reg].mean(), train_df[target_reg].std())
# print(test_df[target_reg].mean(), test_df[target_reg].std()) 
# print(label_encoders["category"].classes_)
# print(data["category"].unique())  # ควรมีจำนวนเท่ากัน


In [43]:
# from prettytable import PrettyTable
# columns_to_keep = categorical + numerical + [target_reg]+ [target_class]

# # Drop columns not in columns_to_keep
# data_keep = data[columns_to_keep]
# # Create a pretty table
# pt = PrettyTable()
# pt.field_names = data_keep.columns  # Set column headers

# # Add rows to the table
# for row in data_keep.head(10).values:
#     pt.add_row(row)

# # Print the table
# print(pt)

In [44]:
class MultiTaskTabNet(nn.Module):
    def __init__(self, input_dim, n_classes, group_attention_matrix=None):
        super(MultiTaskTabNet, self).__init__()

        self.tabnet = TabNet(
            input_dim=input_dim,
            output_dim=64,          # ลดขนาด latent feature
            n_d=4,
            n_a=4,
            n_steps=2,
            gamma=1.5,
            n_independent=2,
            n_shared=2,
            virtual_batch_size=64,
            momentum=0.02,
            mask_type="entmax15",
            group_attention_matrix=group_attention_matrix
        )

        # Classification head
        self.classification_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.LayerNorm(32),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, n_classes)
        )

        # Regression head
        self.regression_head = nn.Sequential(
            nn.Linear(64, 32),
            nn.LayerNorm(32),
            nn.LeakyReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        shared_features, _ = self.tabnet(x)
        class_output = self.classification_head(shared_features)
        reg_output = self.regression_head(shared_features)
        return class_output, reg_output


In [45]:
# import optuna

# import numpy as np
# from sklearn.metrics import accuracy_score
# from torch.utils.data import DataLoader, TensorDataset
# import torch

# X_train = torch.tensor(X_train, dtype=torch.float32)
# y_train_class = torch.tensor(y_train_class, dtype=torch.long)

# train_data = TensorDataset(X_train, y_train_class)
# train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# def objective(trial):
#     output_dim = trial.suggest_categorical("output_dim", [128, 256, 512])
#     n_d = trial.suggest_int("n_d", 32, 128, step=8)
#     n_a = trial.suggest_int("n_a", 32, 128, step=8)
#     n_steps = trial.suggest_int("n_steps", 5, 7)
#     gamma = trial.suggest_float("gamma", 1.0, 2.0, step=0.1)
#     lr = trial.suggest_loguniform("lr", 1e-5, 1e-3)
#     mask_type = trial.suggest_categorical("mask_type", ["sparsemax", "entmax15", "alpha11entmax"])
    
#     class_hidden_size = trial.suggest_int("class_hidden_size", 96, 256, step=32)
#     class_dropout_rate = trial.suggest_float("class_dropout_rate", 0.3, 0.7)
    
#     reg_hidden_size = trial.suggest_int("reg_hidden_size", 48, 128, step=16)
#     reg_dropout_rate = trial.suggest_float("reg_dropout_rate", 0.3, 0.9)
    
#     model = MultiTaskTabNet(input_dim=X_train.shape[1], n_classes=len(valid_states))
    
#     model.classification_head = nn.Sequential(
#         nn.Linear(128, class_hidden_size),
#         nn.BatchNorm1d(class_hidden_size),
#         nn.LeakyReLU(),
#         nn.Dropout(class_dropout_rate),

#         nn.Linear(class_hidden_size, class_hidden_size // 2),
#         nn.BatchNorm1d(class_hidden_size // 2),
#         nn.LeakyReLU(),
#         nn.Dropout(class_dropout_rate),

#         nn.Linear(class_hidden_size // 2, class_hidden_size // 4),
#         nn.BatchNorm1d(class_hidden_size // 4),
#         nn.LeakyReLU(),
#         nn.Dropout(class_dropout_rate),

#         nn.Linear(class_hidden_size // 4, class_hidden_size // 8),
#         nn.LeakyReLU(),
#         nn.Dropout(class_dropout_rate),

#         nn.Linear(class_hidden_size // 8, len(valid_states))
#     )
    
#     model.regression_head = nn.Sequential(
#         nn.Linear(128, reg_hidden_size),
#         nn.BatchNorm1d(reg_hidden_size),
#         nn.LeakyReLU(),
#         nn.Dropout(reg_dropout_rate),

#         nn.Linear(reg_hidden_size, reg_hidden_size // 2),
#         nn.BatchNorm1d(reg_hidden_size // 2),
#         nn.LeakyReLU(),
#         nn.Dropout(reg_dropout_rate),

#         nn.Linear(reg_hidden_size // 2, reg_hidden_size // 4),
#         nn.BatchNorm1d(reg_hidden_size // 4),
#         nn.LeakyReLU(),
#         nn.Dropout(reg_dropout_rate),

#         nn.Linear(reg_hidden_size // 4, 1)
#     )
    
#     optimizer = torch.optim.Adam(model.parameters(), lr=lr)
#     loss_class = nn.CrossEntropyLoss()
    
#     # เทรนโมเดลแบบสั้น ๆ (10 Epoch)
#     for epoch in range(25):
#         model.train()
#         for X_batch, y_batch in train_loader:
#             optimizer.zero_grad()
#             class_pred, _ = model(X_batch)
#             loss = loss_class(class_pred, y_batch)
#             loss.backward()
#             optimizer.step()
    
#     # ทดสอบโมเดล
#     model.eval()
#     with torch.no_grad():
#         class_pred_test, _ = model(X_test)
#         test_pred = torch.argmax(class_pred_test, dim=1)
#         accuracy = accuracy_score(y_test_class.numpy(), test_pred.numpy())
    
#     return accuracy  # คืนค่าความแม่นยำไปให้ Optuna เลือก

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=200)  

# best_params = study.best_params
# print("Best Hyperparameters:", best_params)

# final_model = MultiTaskTabNet(
#     input_dim=X_train.shape[1],
#     n_classes=len(valid_states),
#     # output_dim=best_params["output_dim"],
#     n_d=best_params["n_d"],
#     n_a=best_params["n_a"],
#     n_steps=best_params["n_steps"],
#     gamma=best_params["gamma"],
#     mask_type=best_params["mask_type"],
# )


# final_optimizer = torch.optim.Adam(final_model.parameters(), lr=best_params["lr"])
# loss_class = nn.CrossEntropyLoss()  # เพิ่ม loss function ที่หายไป

# for epoch in range(50):  
#     final_model.train()
#     for X_batch, y_batch in train_loader:
#         final_optimizer.zero_grad()
#         class_pred, _ = final_model(X_batch)
#         loss = loss_class(class_pred, y_batch)
#         loss.backward()
#         final_optimizer.step()

#     with torch.no_grad():
#         class_pred_test, _ = final_model(X_test)
#         test_pred = torch.argmax(class_pred_test, dim=1)
#         accuracy = accuracy_score(y_test_class.numpy(), test_pred.numpy())
    
#     print(f"Epoch {epoch+1}/50 - Test Accuracy: {accuracy:.4f}")


In [46]:
# import torch
# import torch.nn as nn
# import numpy as np
# from sklearn.metrics import r2_score
# from torch.utils.data import DataLoader, TensorDataset

# # กำหนด model
# input_dim = X_train.shape[1]
# num_classes = 2

# model = MultiTaskTabNet(input_dim, n_classes=num_classes, group_attention_matrix=torch.eye(input_dim))

# # กำหนด optimizer และ loss functions
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
# loss_class = nn.CrossEntropyLoss()  # สำหรับ Classification
# loss_reg = nn.MSELoss()  # สำหรับ Regression

# # สร้าง DataLoader สำหรับ training และ test
# train_data = TensorDataset(X_train, y_train_class, y_train_reg)
# test_data = TensorDataset(X_test, y_test_class, y_test_reg)

# train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_data, batch_size=256, shuffle=False)

# # กำหนดตัวแปรเก็บค่า loss และ metric
# train_loss_history = []
# test_loss_history = []
# train_mse_history = []
# test_mse_history = []
# train_r2_history = []
# test_r2_history = []
# test_accuracy_history = []
# train_accuracy_history = []

# # Training loop
# epochs = 100
# for epoch in range(epochs):
#     model.train()

#     train_preds_reg = []
#     train_targets_reg = []
#     train_preds_class = []
#     train_targets_class = []

#     # Loop through batches of data from train loader
#     for X_batch, y_batch_class, y_batch_reg in train_loader:
#         optimizer.zero_grad()

#         # Forward pass
#         class_pred, reg_pred = model(X_batch)

#         # พิมพ์ shape หลังจาก forward pass
#         # print("y_batch_reg shape:", y_batch_reg.shape)
#         # print("reg_pred shape after squeeze:", reg_pred.squeeze().shape)

#         # คำนวณ loss
#         loss1 = loss_class(class_pred, y_batch_class)
#         loss2 = loss_reg(reg_pred.squeeze(), y_batch_reg)

#         total_loss = loss1 + loss2

#         # Backpropagation
#         total_loss.backward()
#         # Gradient Clipping
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
#         optimizer.step()

#         # เก็บค่าทุก batch สำหรับคำนวณทีหลัง
#         train_preds_reg.append(reg_pred.detach().cpu().numpy())
#         train_targets_reg.append(y_batch_reg.detach().cpu().numpy())

#         train_preds_class.append(torch.argmax(class_pred, dim=1).detach().cpu().numpy())
#         train_targets_class.append(y_batch_class.detach().cpu().numpy())

#     # ✅ แปลง List เป็น numpy array ก่อนคำนวณ
#     train_preds_reg = np.concatenate(train_preds_reg, axis=0)
#     train_targets_reg = np.concatenate(train_targets_reg, axis=0)

#     train_preds_class = np.concatenate(train_preds_class, axis=0)
#     train_targets_class = np.concatenate(train_targets_class, axis=0)

#     train_r2 = r2_score(train_targets_reg, train_preds_reg.squeeze())  # R² score
#     train_accuracy = (train_preds_class == train_targets_class).mean()  # Accuracy

#     # เก็บค่า loss
#     train_loss_history.append(total_loss.item())
#     train_mse_history.append(loss2.item())
#     train_r2_history.append(train_r2)
#     train_accuracy_history.append(train_accuracy)

#     print(f"Epoch {epoch+1}/{epochs}, "
#           f"Train Classification Loss: {loss1.item():.4f}, Train Classification Accuracy: {train_accuracy:.4f}, "
#           f"Train Regression Loss (MSE): {loss2.item():.4f}, Train Regression R²: {train_r2:.4f}")

#     # Evaluation mode
#     model.eval()
#     test_preds_reg = []
#     test_targets_reg = []
#     test_preds_class = []
#     test_targets_class = []

#     with torch.no_grad():
#         for X_batch, y_batch_class, y_batch_reg in test_loader:
#             class_pred_test, reg_pred_test = model(X_batch)

#             # คำนวณค่า loss สำหรับ test
#             test_loss_class = loss_class(class_pred_test, y_batch_class).item()
#             test_loss_reg = loss_reg(reg_pred_test.squeeze(), y_batch_reg).item()

#             test_loss = test_loss_class + test_loss_reg
#             test_loss_history.append(test_loss)

#             # เก็บค่าทุก batch สำหรับ test
#             test_preds_reg.append(reg_pred_test.detach().cpu().numpy())
#             test_targets_reg.append(y_batch_reg.detach().cpu().numpy())

#             test_preds_class.append(torch.argmax(class_pred_test, dim=1).detach().cpu().numpy())
#             test_targets_class.append(y_batch_class.detach().cpu().numpy())

#     # ✅ แปลง List เป็น numpy array ก่อนคำนวณ
#     test_preds_reg = np.concatenate(test_preds_reg, axis=0)
#     test_targets_reg = np.concatenate(test_targets_reg, axis=0)

#     test_preds_class = np.concatenate(test_preds_class, axis=0)
#     test_targets_class = np.concatenate(test_targets_class, axis=0)

#     test_r2 = r2_score(test_targets_reg, test_preds_reg.squeeze())
#     test_accuracy = (test_preds_class == test_targets_class).mean()

#     test_r2_history.append(test_r2)
#     test_mse_history.append(test_loss_reg)
#     test_accuracy_history.append(test_accuracy)

#     print(f"Epoch {epoch+1}/{epochs}, "
#           f"Test Classification Loss: {test_loss_class:.4f}, Test Classification Accuracy: {test_accuracy:.4f}, "
#           f"Test Regression Loss (MSE): {test_loss_reg:.4f}, Test Regression R²: {test_r2:.4f}")
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import r2_score
from torch.utils.data import DataLoader, TensorDataset

# กำหนดอุปกรณ์เป็น CPU
device = torch.device("cpu")

# กำหนด model
input_dim = X_train.shape[1]
num_classes = 2
model = MultiTaskTabNet(input_dim, n_classes=num_classes, group_attention_matrix=torch.eye(input_dim)).to(device)

# กำหนด optimizer และ loss functions
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
loss_class = nn.CrossEntropyLoss()  # สำหรับ Classification
loss_reg = nn.MSELoss()  # สำหรับ Regression

# สร้าง DataLoader สำหรับ training และ test
train_data = TensorDataset(X_train, y_train_class, y_train_reg)
test_data = TensorDataset(X_test, y_test_class, y_test_reg)

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=256, shuffle=False)

# Training loop
epochs = 100

# Early stopping parameters
best_test_r2 = -np.inf
patience = 10
counter = 0

for epoch in range(epochs):
    model.train()

    train_preds_reg, train_targets_reg = [], []
    train_preds_class, train_targets_class = [], []

    for X_batch, y_batch_class, y_batch_reg in train_loader:
        X_batch = X_batch.to(device)
        y_batch_class = y_batch_class.to(device)
        y_batch_reg = y_batch_reg.to(device)

        optimizer.zero_grad()

        class_pred, reg_pred = model(X_batch)

        loss1 = loss_class(class_pred, y_batch_class)
        loss2 = loss_reg(reg_pred.squeeze(), y_batch_reg)
        total_loss = loss1 + loss2

        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_preds_reg.append(reg_pred.detach().cpu().numpy())
        train_targets_reg.append(y_batch_reg.detach().cpu().numpy())
        train_preds_class.append(torch.argmax(class_pred, dim=1).detach().cpu().numpy())
        train_targets_class.append(y_batch_class.detach().cpu().numpy())

    train_preds_reg = np.concatenate(train_preds_reg)
    train_targets_reg = np.concatenate(train_targets_reg)
    train_preds_class = np.concatenate(train_preds_class)
    train_targets_class = np.concatenate(train_targets_class)

    train_r2 = r2_score(train_targets_reg, train_preds_reg.squeeze())
    train_accuracy = (train_preds_class == train_targets_class).mean()

    print(f"Epoch {epoch+1}/{epochs}, "
          f"Train Class Loss: {loss1.item():.4f}, Acc: {train_accuracy:.4f}, "
          f"Reg Loss: {loss2.item():.4f}, R2: {train_r2:.4f}")

    # Evaluation
    model.eval()
    test_preds_reg, test_targets_reg = [], []
    test_preds_class, test_targets_class = [], []



    with torch.no_grad():
        for X_batch, y_batch_class, y_batch_reg in test_loader:
            X_batch = X_batch.to(device)
            y_batch_class = y_batch_class.to(device)
            y_batch_reg = y_batch_reg.to(device)

            class_pred_test, reg_pred_test = model(X_batch)

            test_preds_reg.append(reg_pred_test.detach().cpu().numpy())
            test_targets_reg.append(y_batch_reg.detach().cpu().numpy())
            test_preds_class.append(torch.argmax(class_pred_test, dim=1).detach().cpu().numpy())
            test_targets_class.append(y_batch_class.detach().cpu().numpy())

    test_preds_reg = np.concatenate(test_preds_reg)
    test_targets_reg = np.concatenate(test_targets_reg)
    test_preds_class = np.concatenate(test_preds_class)
    test_targets_class = np.concatenate(test_targets_class)

    test_r2 = r2_score(test_targets_reg, test_preds_reg.squeeze())
    test_accuracy = (test_preds_class == test_targets_class).mean()

    print(f"Epoch {epoch+1}/{epochs}, "
          f"Test Class Acc: {test_accuracy:.4f}, Test Reg R2: {test_r2:.4f}")
    
        # === Early Stopping logic ===
    if test_r2 > best_test_r2 + 1e-4:
        best_test_r2 = test_r2
        counter = 0
        best_model_state = model.state_dict()  # Save best model
    else:
        counter += 1
        if counter >= patience:
            print(f"\n⛔ Early stopping at epoch {epoch+1}. Best R²: {best_test_r2:.4f}")
            break



Epoch 1/100, Train Class Loss: 0.0055, Acc: 0.9222, Reg Loss: 0.0675, R2: 0.7264
Epoch 1/100, Test Class Acc: 0.9979, Test Reg R2: 0.9530
Epoch 2/100, Train Class Loss: 4.3656, Acc: 0.9965, Reg Loss: 2.0850, R2: 0.9350
Epoch 2/100, Test Class Acc: 0.9997, Test Reg R2: 0.9601
Epoch 3/100, Train Class Loss: 0.0005, Acc: 0.9988, Reg Loss: 0.0006, R2: 0.9399
Epoch 3/100, Test Class Acc: 1.0000, Test Reg R2: 0.9611
Epoch 4/100, Train Class Loss: 0.0025, Acc: 0.9993, Reg Loss: 0.0280, R2: 0.9415
Epoch 4/100, Test Class Acc: 0.9998, Test Reg R2: 0.9593
Epoch 5/100, Train Class Loss: 6.8488, Acc: 0.9994, Reg Loss: 4.4356, R2: 0.9411
Epoch 5/100, Test Class Acc: 1.0000, Test Reg R2: 0.9585
Epoch 6/100, Train Class Loss: 5.7777, Acc: 0.9996, Reg Loss: 4.3160, R2: 0.9426
Epoch 6/100, Test Class Acc: 1.0000, Test Reg R2: 0.9609
Epoch 7/100, Train Class Loss: 4.1449, Acc: 0.9994, Reg Loss: 1.9499, R2: 0.9442
Epoch 7/100, Test Class Acc: 1.0000, Test Reg R2: 0.9607
Epoch 8/100, Train Class Loss: 6.0

In [47]:
# import torch
# import torch.nn as nn
# from sklearn.metrics import r2_score
# from torch.utils.data import DataLoader, TensorDataset

# # Define model
# input_dim = X_train.shape[1]
# num_classes = len(valid_states)
# print(input_dim)
# # num_classes = 1

# model = MultiTaskTabNet(input_dim, n_classes=num_classes, group_attention_matrix=torch.eye(input_dim))

# # Define optimizer and loss functions
# # optimizer = torch.optim.Adam(model.parameters(), lr=0.00001) # 0.01 ดี
# optimizer = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.9, weight_decay=0.01)
# # optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)

# # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
# loss_class = nn.CrossEntropyLoss()  # For classification
# loss_reg = nn.MSELoss()  # Use Mean Squared Error (MSE) for regression

# train_loss_history = []
# test_loss_history = []
# train_mse_history = []
# test_mse_history = []
# train_r2_history = []
# test_r2_history = []
# test_accuracy_history = []
# train_accuracy_history = []


# # early_stopping = EarlyStopping(patience=10, verbose=True)
# # Training loop
# epochs = 50
# for epoch in range(epochs):
#     model.train()
#     optimizer.zero_grad()

#     # Forward pass
#     class_pred, reg_pred = model(X_train)

#     # Compute losses
#     loss1 = loss_class(class_pred, y_train_class)
#     loss2 = loss_reg(reg_pred.squeeze(), y_train_reg)  # MSE loss for regression

#     # Total loss
#     total_loss = loss1 + loss2

#     # Backpropagation
#     total_loss.backward()
#     optimizer.step()
    
#     # scheduler.step()  

#     # Append train loss and MSE history
#     train_loss_history.append(total_loss.item())

#     # Calculate training MSE and R²
#     train_mse = loss2.item()  # MSE for regression
#     train_mse_history.append(train_mse)
#     train_r2 = r2_score(y_train_reg.numpy(), reg_pred.squeeze().detach().numpy())
#     train_r2_history.append(train_r2)

#     # Compute train accuracy
#     predicted_labels_train = torch.argmax(class_pred, dim=1)
#     train_accuracy = (predicted_labels_train == y_train_class).float().mean().item()
#     train_accuracy_history.append(train_accuracy)

#     # Print results for the current epoch
#     print(f"Epoch {epoch+1}/{epochs}, "
#           f"Train Classification Loss: {loss1.item():.4f}, Train Classification Accuracy: {train_accuracy:.4f}, "
#           f"Train Regression Loss (MSE): {loss2.item():.4f}, Train Regression MSE: {train_mse:.4f}, "
#           f"Train Regression R²: {train_r2:.4f}")

#     model.eval()
#     with torch.no_grad():
#         class_pred_test, reg_pred_test = model(X_test)

#         # Compute test loss
#         test_loss_class = loss_class(class_pred_test, y_test_class).item()
#         test_loss_reg = loss_reg(reg_pred_test.squeeze(), y_test_reg).item()

#         test_loss = test_loss_class + test_loss_reg
#         test_loss_history.append(test_loss)

#         # Compute test accuracy
#         predicted_labels_test = torch.argmax(class_pred_test, dim=1)
#         test_accuracy = (predicted_labels_test == y_test_class).float().mean().item()
#         test_accuracy_history.append(test_accuracy)

#         # Calculate R² score for regression
#         r2_test = r2_score(y_test_reg.numpy(), reg_pred_test.squeeze().numpy())
#         test_r2_history.append(r2_test)

#         # Calculate MSE for test data
#         test_mse = loss_reg(reg_pred_test.squeeze(), y_test_reg).item()
#         test_mse_history.append(test_mse)

        
#     # early_stopping(val_loss, model)    
#     # if early_stopping.early_stop:
#     #     print("Early stopping")
#     #     break
#     # # After each epoch, print test results
#     print(f"Epoch {epoch+1}/{epochs}, "
#           f"Test Classification Loss: {test_loss_class:.4f}, Test Classification Accuracy: {test_accuracy:.4f}, "
#           f"Test Regression Loss (MSE): {test_loss_reg:.4f}, Test Regression R²: {r2_test:.4f}, "
#           f"Test MSE: {test_mse:.4f}")

#     # Switch back to train mode for the next epoch
#     model.train()



In [48]:
# Final results after all epochs
final_test_loss_class = test_loss_class
final_test_accuracy = test_accuracy
final_test_loss_reg = test_loss_reg
final_r2_score = test_r2

# Final results print in desired format
print(f"Test Classification Loss: {final_test_loss_class:.4f}")
print(f"Test Classification Accuracy: {final_test_accuracy:.4f}")
print(f"Test Regression Loss (MSE): {final_test_loss_reg:.4f}")
print(f"Test Regression R² Score: {final_r2_score:.4f}")

NameError: name 'test_loss_class' is not defined

In [None]:
# --- Plot Training and Test Loss ---
plt.figure(figsize=(12, 5))
plt.plot(train_loss_history, linestyle='-', label='Train Loss')
plt.plot(test_loss_history, linestyle='-', label='Test Loss')
plt.title('Training and Test Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

# --- Plot Training and Test MSE ---
plt.figure(figsize=(12, 5))
plt.plot(train_mse_history, linestyle='-', label='Train MSE')
plt.plot(test_mse_history, linestyle='-', label='Test MSE', color='orange')
plt.title('Training and Test MSE Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.legend()
plt.grid(True)
plt.show()

# --- Plot Train and Test Accuracy ---
plt.figure(figsize=(12, 5))
plt.plot(train_accuracy_history, linestyle='-', label='Train Accuracy')
plt.plot(test_accuracy_history, linestyle='-', label='Test Accuracy', color='orange')
plt.title('Train and Test Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

# --- Plot Train and Test R² Score ---
plt.figure(figsize=(12, 5))
plt.plot(train_r2_history, linestyle='-', label='Train R² Score')
plt.plot(test_r2_history, linestyle='-', label='Test R² Score', color='orange')
plt.title('Train and Test R² Score Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('R² Score')
plt.legend()
plt.grid(True)
plt.show()