Lokesh M
212223230114

In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
data = pd.read_csv("income.csv")

In [None]:
# Display information
print("Shape:", data.shape)
print(data.head())

Shape: (30000, 10)
   age     sex    education  education-num marital-status    workclass  \
0   27    Male      HS-grad              9  Never-married      Private   
1   47    Male      Masters             14        Married    Local-gov   
2   59    Male      HS-grad              9       Divorced     Self-emp   
3   38  Female  Prof-school             15  Never-married  Federal-gov   
4   64  Female         11th              7        Widowed      Private   

        occupation  hours-per-week income  label  
0     Craft-repair              40  <=50K      0  
1  Exec-managerial              50   >50K      1  
2   Prof-specialty              20  <=50K      0  
3   Prof-specialty              57   >50K      1  
4  Farming-fishing              40  <=50K      0  


In [4]:
label_col = 'income' 

In [5]:
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove(label_col)
continuous_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical:", categorical_cols)
print("Continuous:", continuous_cols)
print("Label:", label_col)

Categorical: ['sex', 'education', 'marital-status', 'workclass', 'occupation']
Continuous: ['age', 'education-num', 'hours-per-week', 'label']
Label: income


In [6]:
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

# Encode label column
target_encoder = LabelEncoder()
data[label_col] = target_encoder.fit_transform(data[label_col])

In [7]:
# Scale continuous columns
scaler = StandardScaler()
data[continuous_cols] = scaler.fit_transform(data[continuous_cols])

In [8]:
train_data = data.iloc[:25000]
test_data = data.iloc[25000:]

In [9]:
cat_train = torch.tensor(train_data[categorical_cols].values, dtype=torch.int64)
con_train = torch.tensor(train_data[continuous_cols].values, dtype=torch.float)
y_train = torch.tensor(train_data[label_col].values, dtype=torch.long)

cat_test = torch.tensor(test_data[categorical_cols].values, dtype=torch.int64)
con_test = torch.tensor(test_data[continuous_cols].values, dtype=torch.float)
y_test = torch.tensor(test_data[label_col].values, dtype=torch.long)

In [10]:
# Create DataLoaders
train_ds = TensorDataset(cat_train, con_train, y_train)
test_ds = TensorDataset(cat_test, con_test, y_test)
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)

In [11]:
# define model
class TabularModel(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, hidden_sz=50, dropout=0.4):
        super().__init__()
        # Embeddings for categorical variables
        self.embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(dropout)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        
        # Layers
        self.fc1 = nn.Linear(sum([nf for _, nf in emb_szs]) + n_cont, hidden_sz)
        self.bn1 = nn.BatchNorm1d(hidden_sz)
        self.fc2 = nn.Linear(hidden_sz, out_sz)
        self.out = nn.Softmax(dim=1)
        self.drop = nn.Dropout(dropout)

    def forward(self, x_cat, x_cont):
        # Embedding categorical data
        embeddings = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embeddings, 1)
        x = self.emb_drop(x)
        
        # Normalize continuous
        x_cont = self.bn_cont(x_cont)
        
        # Combine
        x = torch.cat([x, x_cont], 1)
        x = self.drop(torch.relu(self.bn1(self.fc1(x))))
        x = self.fc2(x)
        return x


In [12]:
cat_szs = [len(label_encoders[col].classes_) for col in categorical_cols]
emb_szs = [(size, min(50, (size + 1) // 2)) for size in cat_szs]

In [14]:

model = TabularModel(emb_szs, len(continuous_cols), 2, hidden_sz=50, dropout=0.4)


In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [16]:
#Evaluation
epochs = 300
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for cat, con, label in train_loader:
        optimizer.zero_grad()
        output = model(cat, con)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    if (epoch+1) % 50 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")

Epoch [50/300], Loss: 0.0000
Epoch [100/300], Loss: 0.0000
Epoch [150/300], Loss: 0.0000
Epoch [200/300], Loss: 0.0000
Epoch [250/300], Loss: 0.0000
Epoch [300/300], Loss: 0.0000


In [17]:
model.eval()
correct, total, test_loss = 0, 0, 0.0
with torch.no_grad():
    for cat, con, label in test_loader:
        outputs = model(cat, con)
        loss = criterion(outputs, label)
        test_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == label).sum().item()
        total += label.size(0)

accuracy = correct / total
print(f"\nTest Loss: {test_loss/len(test_loader):.4f}")
print(f"Test Accuracy: {accuracy*100:.2f}%")


Test Loss: 0.0000
Test Accuracy: 100.00%
