In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset, DataLoader

In [4]:
df = pd.read_csv("income.csv")
df

Unnamed: 0,age,sex,education,education-num,marital-status,workclass,occupation,hours-per-week,income,label
0,27,Male,HS-grad,9,Never-married,Private,Craft-repair,40,<=50K,0
1,47,Male,Masters,14,Married,Local-gov,Exec-managerial,50,>50K,1
2,59,Male,HS-grad,9,Divorced,Self-emp,Prof-specialty,20,<=50K,0
3,38,Female,Prof-school,15,Never-married,Federal-gov,Prof-specialty,57,>50K,1
4,64,Female,11th,7,Widowed,Private,Farming-fishing,40,<=50K,0
...,...,...,...,...,...,...,...,...,...,...
29995,45,Male,Masters,14,Married,State-gov,Prof-specialty,60,>50K,1
29996,33,Male,HS-grad,9,Married,Private,Machine-op-inspct,40,>50K,1
29997,47,Male,Prof-school,15,Married,Private,Prof-specialty,55,>50K,1
29998,32,Female,Some-college,10,Never-married,Private,Adm-clerical,40,<=50K,0


In [5]:
cat_col = [col for col in df.columns if df[col].dtype == "object" and col != "income"]
cont_col = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col != 'label']
label_col = 'income'

In [6]:
label_enc = LabelEncoder()
df[label_col] = label_enc.fit_transform(df[label_col])

In [7]:
cat_encoders = {}
for cat in cat_col:
    le = LabelEncoder()
    df[cat] = le.fit_transform(df[cat])
    cat_encoders[cat] = le

cat_encoders

{'sex': LabelEncoder(),
 'education': LabelEncoder(),
 'marital-status': LabelEncoder(),
 'workclass': LabelEncoder(),
 'occupation': LabelEncoder()}

In [8]:
x_cats = df[cat_col].values
x_conts = df[cont_col].values
y = df[label_col].values

In [9]:
scaler = StandardScaler()
x_conts = scaler.fit_transform(x_conts)

In [10]:
X_cats_train, X_cats_test, X_conts_train, X_conts_test, y_train, y_test = train_test_split(x_cats, x_conts, y, test_size=0.2, random_state=42)

In [11]:
cats_train = torch.tensor(X_cats_train, dtype=torch.int64)
conts_train = torch.tensor(X_conts_train, dtype=torch.float)
y_train = torch.tensor(y_train, dtype=torch.float)

In [12]:
cats_test = torch.tensor(X_cats_test, dtype=torch.int64)
conts_test = torch.tensor(X_conts_test, dtype=torch.float)
y_test = torch.tensor(y_test, dtype=torch.float)

In [13]:
train_ds = TensorDataset(cats_train, conts_train, y_train)
test_ds = TensorDataset(cats_test, conts_test, y_test)

In [14]:
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64)

In [15]:
class TabularModel(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz=2, hidden_units=50, dropout=0.4):
        super().__init__()
        self.embeds = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in emb_szs])
        self.emb_drop = nn.Dropout(dropout)
        self.bn_cont = nn.BatchNorm1d(n_cont)
        n_emb = sum([nf for _, nf in emb_szs])
        self.fc1 = nn.Linear(n_emb + n_cont, hidden_units)
        self.drop = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_units, out_sz)

    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeds)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x_cont = self.bn_cont(x_cont)
        x = torch.cat([x, x_cont], 1)
        x = F.relu(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return x

In [16]:
emb_sizes = [(len(df[col].unique()), min(50, (len(df[col].unique())+1)//2)) for col in cat_col]

In [17]:
model = TabularModel(emb_sizes, n_cont=len(cont_col))
torch.manual_seed(42)

<torch._C.Generator at 0x78cf70a3cab0>

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 300

In [19]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for cat, cont, y in train_dl:
        optimizer.zero_grad()
        y_pred = model(cat, cont)
        loss = criterion(y_pred, y.long())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch+1) % 50 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_dl):.4f}")

Epoch 50/300, Loss: 0.2693
Epoch 100/300, Loss: 0.2669
Epoch 150/300, Loss: 0.2640
Epoch 200/300, Loss: 0.2626
Epoch 250/300, Loss: 0.2682
Epoch 300/300, Loss: 0.2655


In [20]:
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
    for cat, cont, y in test_dl:
        y_pred = model(cat, cont)
        loss = criterion(y_pred, y.long())
        test_loss += loss.item()
        correct += (y_pred.argmax(dim=1) == y).sum().item()
print(f"Test Loss: {test_loss/len(test_dl):.4f}, Accuracy: {correct/len(test_ds):.4f}")

Test Loss: 0.2526, Accuracy: 0.8832


In [22]:
def predict_new(model, input_dict):
    # categorical part
    cat_vals = []
    for c in cat_col:
        le = cat_encoders[c]
        val = input_dict[c]
        enc = le.transform([val])[0]
        cat_vals.append(enc)
    cat_tensor = torch.tensor([cat_vals], dtype=torch.int64)
    # continuous part
    cont_vals = [input_dict[c] for c in cont_col]
    cont_scaled = scaler.transform([cont_vals])
    cont_tensor = torch.tensor(cont_scaled, dtype=torch.float32)
    model.eval()
    with torch.no_grad():
        out = model(cat_tensor, cont_tensor)
        pred_class = out.argmax(1).item()
        return label_enc.inverse_transform([pred_class])[0]

In [28]:
new_person = {
    'age': 23,
    'sex': 'Female',
    'education': 'HS-grad',
    'education-num': 9,
    'marital-status': 'Never-married',
    'workclass': 'Private',
    'occupation': 'Adm-clerical',
    'hours-per-week': 30
}
print("Predicted Income:", predict_new(model, new_person))


Predicted Income: <=50K
