In [None]:
import pandas as pd
import numpy as np

In [None]:

# Mount Google Drive so files stored there are readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Read the dataset CSV from the mounted drive into a DataFrame.
DATA_PATH = '/content/drive/My Drive/HEROdata2.csv'
df = pd.read_csv(DATA_PATH)

# Plain-text preview of the first rows.
preview = df.head()
print(preview)


In [None]:

# Flag the columns in which every single value is missing ...
all_missing = df.isna().all()
null_columns = df.columns[all_missing]

# ... and remove them from the frame.
df = df.drop(null_columns, axis=1)

In [None]:
# Keep every row but discard the right-most column.
# NOTE(review): positional and not idempotent - re-running this cell removes
# one more column each time.
df = df.iloc[:, :df.shape[1] - 1]

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the "Label" column as integer class ids. The fitted encoder is kept
# so the original class names stay available via `label_encoder.classes_`.
label_encoder = LabelEncoder()
label_encoder.fit(df["Label"])
df["Label"] = label_encoder.transform(df["Label"])

In [None]:
# Show the first five rows now that "Label" holds integer ids.
df.head(n=5)

In [None]:
# Absolute pairwise correlation between the feature columns only.
# The target ("Label") is excluded on purpose: this matrix drives the
# redundancy filter below, and with the target included that filter could
# drop "Label" itself, or drop a feature merely because it tracks the target.
corr_matrix = df.drop(columns=["Label"]).corr().abs()

In [None]:
# Keep only the entries strictly above the diagonal so every pair of columns
# is looked at once; all other entries become NaN.
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(above_diagonal)

# Columns whose correlation with at least one earlier column exceeds 0.95.
exceeds_threshold = (upper > 0.95).any(axis=0)
to_drop = upper.columns[exceeds_threshold].tolist()

In [None]:
# Drop the redundant columns by name (the list of labels is all `drop` needs;
# there is no reason to materialise a sub-frame first). The target is never
# removed here, even if it was flagged, because later cells read df["Label"].
df = df.drop(columns=[column for column in to_drop if column != "Label"])

In [None]:
# Show the first five rows after the correlated columns were removed.
df.head(n=5)

In [None]:
# Target vector: the "Label" column.
Y = df.loc[:, "Label"]

# Feature matrix: every other column.
X = df.drop("Label", axis=1)

In [None]:
# Column count of the current frame ("Label" is still part of `df`).
num_columns = len(df.columns)
print("Number of columns:", num_columns)

In [None]:
# Class balance: number of rows for each value of the "Label" column.
label_counts = df['Label'].value_counts()
print(label_counts)


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Remove constant columns: a column with a single distinct value has zero
# variance, so its correlations are undefined (NaN) and would appear as
# blank rows/columns in the heatmap.
df_filtered = df.loc[:, df.nunique() > 1]

# Compute the correlation matrix of the remaining feature columns.
# The filtered frame is the one that must be used here (previously it was
# computed and then ignored). errors="ignore" covers the case where "Label"
# itself was constant and has already been filtered away.
corr_matrix = df_filtered.drop(columns=["Label"], errors="ignore").corr()

# Plot heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
    annot_kws={"size": 8}
)

plt.title("Correlation Matrix Heatmap", fontsize=14)
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.show()


In [None]:

# List the column labels that remain in `df` after the filtering steps above.
df.columns


In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

In [None]:
X

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X , Y, stratify=Y, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf

In [None]:

# Start an empty sequential Keras model; layers are appended in the next cells.
model = tf.keras.Sequential()

In [None]:
# First hidden layer: 512 units with ReLU activation.
model.add(tf.keras.layers.Dense(512, activation='relu'))

In [None]:
# Three further hidden layers of decreasing width, all with ReLU activation.
for hidden_units in (256, 128, 64):
    model.add(tf.keras.layers.Dense(hidden_units, activation='relu'))

In [None]:
# Output layer: a single unit with sigmoid activation.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
# NOTE(review): in the visible cells this Keras model is compiled but never
# fitted, and the name `model` is later rebound to the TabTransformer.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


TabNet

In [None]:
!pip install pytorch-tabnet

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

In [None]:
# Count missing values in each split before imputation.
train_nan_count = np.isnan(X_train).sum()
print("NaNs in X_train:", train_nan_count)
test_nan_count = np.isnan(X_test).sum()
print("NaNs in X_test:", test_nan_count)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing values with the per-column mean. The means are learned from
# the training split only and then applied unchanged to the test split.
imputer = SimpleImputer(strategy='mean')  # alternatives: 'median', 'most_frequent'
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)


In [None]:
# Re-count missing values to confirm the imputation left none behind.
train_nan_count = np.isnan(X_train).sum()
print("NaNs in X_train after cleaning:", train_nan_count)
test_nan_count = np.isnan(X_test).sum()
print("NaNs in X_test after cleaning:", test_nan_count)


In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.utils.class_weight import compute_class_weight
import torch
import numpy as np

# Labels as flat NumPy arrays (the fit call below is given arrays, not Series).
y_train_flat = np.array(Y_train).flatten()
y_test_flat = np.array(Y_test).flatten()

# Balanced class weights: rarer classes receive proportionally larger weights.
# `classes` is computed once and reused for both the weights and the dict keys.
classes = np.unique(y_train_flat)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_flat)
class_weights_dict = {int(cls): float(weight) for cls, weight in zip(classes, class_weights)}
print("Class weights:", class_weights_dict)

# Instantiate TabNetClassifier.
# `scheduler_params` only takes effect together with `scheduler_fn`; without
# it no learning-rate scheduler is created and the step_size/gamma settings
# are silently ignored. StepLR is the scheduler those two parameters belong to.
clf = TabNetClassifier(
    n_d=64,
    n_a=64,
    n_steps=7,
    gamma=1.5,
    lambda_sparse=1e-4,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),
    mask_type='sparsemax',
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 10, "gamma": 0.9},
    verbose=1
)

# Train the model.
# NOTE(review): the test split is the last entry of `eval_set`, so early
# stopping (`patience`) is presumably driven by test data - confirm, and
# consider a separate validation split to keep the test score unbiased.
clf.fit(
    X_train=X_train,
    y_train=y_train_flat,
    eval_set=[(X_train, y_train_flat), (X_test, y_test_flat)],
    eval_name=['train', 'test'],
    eval_metric=['auc', 'balanced_accuracy', 'accuracy'],
    max_epochs=200,
    patience=50,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    weights=class_weights_dict,  # per-class weights keyed by class id
    drop_last=False
)


In [None]:

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Hard class predictions of the fitted TabNet model on the test split.
y_pred = clf.predict(X_test)

# Confusion matrix of true labels (rows) against predictions (columns).
cm = confusion_matrix(Y_test, y_pred)

# Annotated heatmap of the raw counts.
predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


In [None]:
# TabNet on the test split: overall accuracy and the per-class report.

from sklearn.metrics import accuracy_score, classification_report

# Predictions are recomputed so this cell does not depend on the previous one.
y_pred = clf.predict(X_test)

# Fraction of test rows classified correctly.
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Text report produced by sklearn's classification_report.
tabnet_report = classification_report(Y_test, y_pred)
print(tabnet_report)


Tab Transformer

In [None]:
import torch.nn as nn
class TabTransformer(nn.Module):
    """Transformer-encoder classifier over a dense feature vector.

    The whole feature vector of a row is projected to a single
    ``dim_embedding``-wide token, so the encoder sees a sequence of length 1
    per row.

    Args:
        num_features: size of the last dimension of the input.
        num_classes: number of output scores per row.
        dim_embedding: model width used inside the encoder.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, num_features, num_classes, dim_embedding=64, num_heads=4, num_layers=4):
        super().__init__()
        # Linear projection of the raw features into the encoder width.
        self.embedding = nn.Linear(num_features, dim_embedding)
        # Stack of identical encoder layers operating on (batch, seq, dim) input.
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=dim_embedding, nhead=num_heads, batch_first=True),
            num_layers=num_layers,
        )
        # Final linear head producing one unnormalised score per class.
        self.classifier = nn.Linear(dim_embedding, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for ``x``.

        ``x`` must have ``num_features`` as its last dimension; the training
        code in this notebook passes a 2-D (rows, features) tensor.
        """
        tokens = self.embedding(x).unsqueeze(1)  # insert the sequence axis (length 1)
        encoded = self.transformer(tokens)
        pooled = encoded.mean(dim=1)  # average over the sequence axis
        return self.classifier(pooled)

In [None]:
import torch.optim as optim

# Model dimensions are read from the training data.
num_features = X_train.shape[1]        # number of input features
num_classes = len(np.unique(Y_train))  # number of distinct target classes

# Keep the model and every tensor on ONE device. Moving only the model to
# CUDA while the data stays on the CPU raises a device-mismatch error on GPU
# runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed the weight initialisation (and dropout) so re-runs are comparable.
torch.manual_seed(42)

# Initialize the model, loss, and optimizer
model = TabTransformer(num_features, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert the data to tensors. Going through NumPy first avoids relying on
# how torch iterates a pandas Series whose index was shuffled by
# train_test_split (positional access on such a Series is label-based).
X_train_tensor = torch.as_tensor(np.asarray(X_train), dtype=torch.float32).to(device)
y_train_tensor = torch.as_tensor(np.asarray(Y_train), dtype=torch.long).to(device)

# Training loop: full-batch gradient descent, one optimizer step per epoch.
model.train()
for epoch in range(100):
    optimizer.zero_grad()
    output = model(X_train_tensor)
    loss = criterion(output, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item()}')

In [None]:
# Switch the model to evaluation mode
model.eval()

# Evaluate on whichever device holds the model's parameters, so this cell
# works on CPU and GPU runtimes regardless of where the tensors currently live.
eval_device = next(model.parameters()).device

# Disable gradient calculation for evaluation
with torch.no_grad():
    outputs = model(X_train_tensor.to(eval_device))
    # Predicted class = index of the highest score in each row
    predicted = outputs.argmax(dim=1)

    # Calculate the number of correct predictions
    targets = y_train_tensor.to(eval_device)
    total = targets.size(0)
    correct = (predicted == targets).sum().item()

    # Compute accuracy
    accuracy = correct / total * 100  # in percentage
    print(f'Training Accuracy: {accuracy:.2f}%')


In [None]:
import torch
from sklearn.metrics import accuracy_score, classification_report

# Convert the test data to tensors on the device that holds the model:
# feeding CPU tensors to a CUDA model raises a device-mismatch error.
# np.asarray keeps the label conversion positional even though Y_test is a
# pandas Series whose index was shuffled by train_test_split.
eval_device = next(model.parameters()).device
X_test_tensor = torch.as_tensor(np.asarray(X_test), dtype=torch.float32).to(eval_device)
y_test_tensor = torch.as_tensor(np.asarray(Y_test), dtype=torch.long).to(eval_device)

# Switch to evaluation mode
model.eval()

with torch.no_grad():
    # Get model outputs and compute loss on the test set
    outputs = model(X_test_tensor)
    loss = criterion(outputs, y_test_tensor)

    # Predicted class = index of the highest score in each row
    predicted = outputs.argmax(dim=1)

# Move results back to the CPU and convert to NumPy for sklearn
y_true = y_test_tensor.cpu().numpy()
y_pred = predicted.cpu().numpy()

# Metrics
accuracy = accuracy_score(y_true, y_pred)
report = classification_report(y_true, y_pred)

print(f"Test Loss: {loss.item():.4f}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


In [None]:
from sklearn.metrics import confusion_matrix

# y_true and y_pred are the NumPy arrays produced by the test-evaluation cell.
# Pin the class order to the encoder's ids (0 .. n_classes-1) so the matrix
# always has one row/column per known class and lines up with the tick labels
# below, even if a class is missing from y_true / y_pred.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_true, y_pred, labels=class_ids)

# The TN/FP/FN/TP breakdown only exists for a binary (2x2) matrix:
# [ [TN, FP],
#   [FN, TP] ]
# Guarding the unpack keeps the cell from crashing on any other shape.
if cm.shape == (2, 2):
    tn, fp, fn, tp = cm.ravel()

    print("True Negatives:", tn)
    print("False Positives:", fp)
    print("False Negatives:", fn)
    print("True Positives:", tp)

    print([tn, fp])
    print([fn, tp])

plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Bar chart comparing the test accuracy of TabNet and the TabTransformer.

import matplotlib.pyplot as plt

# Compute both scores from the fitted models instead of typing them in by
# hand, so the chart cannot drift away from the actual results.
tabnet_accuracy = accuracy_score(Y_test, clf.predict(X_test))
# y_true / y_pred are the TabTransformer test labels and predictions from the
# evaluation cell; run that cell first (TabNet cells also assign `y_pred`).
tabtransformer_accuracy = accuracy_score(y_true, y_pred)


models = ['TabNet', 'TabTransformer']
accuracies = [tabnet_accuracy, tabtransformer_accuracy]

plt.figure(figsize=(8, 6))
plt.bar(models, accuracies, color=['skyblue', 'lightcoral'])
plt.xlabel('Model', fontsize=12)
plt.ylabel('Accuracy', fontsize=12)
plt.title('Comparison of TabNet and TabTransformer Accuracy', fontsize=14)
plt.ylim(0, 1.1)  # accuracy lies in 0-1; the headroom keeps the value labels inside the axes
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Add accuracy values on top of each bar; four decimals so that close scores
# remain distinguishable.
for i, v in enumerate(accuracies):
    plt.text(i, v + 0.01, f'{v:.4f}', ha='center', va='bottom', fontsize=10)

plt.show()
