In [1]:
!pip install pytorch-tabnet



In [2]:
import pandas as pd
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.metrics import accuracy_score

In [3]:
dataset_train = pd.read_csv("new_train_v1.csv")
dataset_train.head()

Unnamed: 0,Year,Month,Consumer_type,Consumption,Consumer_number,Installation_zone
0,2013,1,domestic,0,HKNY84394595510414,Installation_zone 3
1,2013,1,domestic,11,YAWF03698196960770,Installation_zone 3
2,2013,1,domestic,12,IMUT47685842447678,Installation_zone 3
3,2013,1,domestic,1,FFKA67947641824695,Installation_zone 3
4,2013,1,domestic,6,PAPH56002797946420,Installation_zone 3


In [4]:
dataset_test = pd.read_csv("test_v1.csv")
dataset_test.head()

Unnamed: 0,Year,Month,Consumer_type,Consumption,Consumer_number,Installation_zone
0,2013,1,domestic,0,MOGV36480546611521,Installation_zone 1
1,2013,1,domestic,6,VRFW65577141436242,Installation_zone 2
2,2013,1,domestic,1,QLLI18662653137621,Installation_zone 2
3,2013,1,domestic,13,HYUO61823402850645,Installation_zone 2
4,2013,1,domestic,31,MXWL75757930683403,Installation_zone 2


In [5]:
le_iz = LabelEncoder()
le_iz.fit(dataset_train['Installation_zone'].unique())

le_ct = LabelEncoder()
le_ct.fit(dataset_train['Consumer_type'].unique())

le_cn = LabelEncoder()
le_cn.fit(dataset_train['Consumer_number'].unique())

dataset_train['Installation_zone'] = le_iz.transform(dataset_train['Installation_zone'])
dataset_train['Consumer_type'] = le_ct.transform(dataset_train['Consumer_type'])
dataset_train['Consumer_number'] = le_cn.transform(dataset_train['Consumer_number'])

In [6]:
le_iz_test = LabelEncoder()
le_iz_test.fit(dataset_test['Installation_zone'].unique())

le_ct_test = LabelEncoder()
le_ct_test.fit(dataset_test['Consumer_type'].unique())

le_cn_test = LabelEncoder()
le_cn_test.fit(dataset_test['Consumer_number'].unique())

dataset_test['Installation_zone'] = le_iz_test.transform(dataset_test['Installation_zone'])
dataset_test['Consumer_type'] = le_ct_test.transform(dataset_test['Consumer_type'])
dataset_test['Consumer_number'] = le_cn_test.transform(dataset_test['Consumer_number'])

In [7]:
X_train = dataset_train.drop('Consumer_type', axis='columns')
y_train = dataset_train['Consumer_type']

X_test = dataset_test.drop('Consumer_type', axis='columns')
y_test = dataset_test['Consumer_type']

In [8]:
# Define TabNet model
clf = TabNetClassifier(
    n_d=8,  # Dimensionality of the model (default is 8)
    n_a=8,  # Attention dimensionality (default is 8)
    n_steps=5,  # Number of steps in the architecture (default is 5)
    gamma=1.5,  # This parameter controls the importance of sparse regularization (default is 1.5)
    n_independent=2,  # Number of independently masked feature columns for each decision step (default is 2)
    n_shared=2,  # Number of shared decision steps (default is 2)
    cat_dims=[X_train[col].nunique() for col in X_train.select_dtypes(include=['object'])],  # Number of categories for each categorical column
    cat_emb_dim=1,  # Embedding dimension for categorical features (default is 1)
    lambda_sparse=0.001,  # This parameter controls the importance of sparse regularization (default is 0.001)
    optimizer_fn=torch.optim.Adam,  # Optimizer used for training (default is Adam)
    optimizer_params=dict(lr=2e-2),  # Learning rate for the optimizer (default is 2e-2)
    scheduler_params={"step_size":50, "gamma":0.9},  # Learning rate scheduler parameters
    scheduler_fn=torch.optim.lr_scheduler.StepLR,  # Learning rate scheduler function
    mask_type='entmax'  # Type of mask to use ('sparsemax' or 'entmax')
)



In [9]:
# Train the model
clf.fit(
    X_train.values, y_train.values,
    eval_set=[(X_train.values, y_train.values), (X_test.values, y_test.values)],
    eval_metric=['accuracy'],
    max_epochs=500,
    patience=20,
    batch_size=int(dataset_train.shape[0]/10),
    virtual_batch_size=int(dataset_train.shape[0]/20),
    num_workers=0,
    weights=1,
    drop_last=False,
    from_unsupervised=None
)

epoch 0  | loss: 1.46411 | val_0_accuracy: 0.16118 | val_1_accuracy: 0.14286 |  0:01:13s
epoch 1  | loss: 0.78123 | val_0_accuracy: 0.16195 | val_1_accuracy: 0.14314 |  0:02:31s
epoch 2  | loss: 0.58156 | val_0_accuracy: 0.27001 | val_1_accuracy: 0.13771 |  0:03:42s
epoch 3  | loss: 0.49844 | val_0_accuracy: 0.15875 | val_1_accuracy: 0.142   |  0:04:55s
epoch 4  | loss: 0.46321 | val_0_accuracy: 0.15377 | val_1_accuracy: 0.13914 |  0:06:06s
epoch 5  | loss: 0.43949 | val_0_accuracy: 0.05113 | val_1_accuracy: 0.15543 |  0:07:16s
epoch 6  | loss: 0.42328 | val_0_accuracy: 0.03693 | val_1_accuracy: 0.15029 |  0:08:29s
epoch 7  | loss: 0.41414 | val_0_accuracy: 0.0927  | val_1_accuracy: 0.15371 |  0:09:39s
epoch 8  | loss: 0.4099  | val_0_accuracy: 0.07317 | val_1_accuracy: 0.14629 |  0:10:49s
epoch 9  | loss: 0.39788 | val_0_accuracy: 0.10369 | val_1_accuracy: 0.13257 |  0:11:58s
epoch 10 | loss: 0.38904 | val_0_accuracy: 0.1039  | val_1_accuracy: 0.14943 |  0:13:09s
epoch 11 | loss: 0.38



In [10]:
# Make predictions on the test set
y_pred = clf.predict(X_test.values)

In [11]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.19542857142857142
