# Income Classification

## PyTorch

### Library

In [14]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

### Datasets

In [None]:
df = pd.read_csv('income.csv')
# df.info()

df.isnull().sum()

In [16]:
df.columns = df.columns.str.strip()

imputer = SimpleImputer(strategy='mean')
df.loc[:, df.select_dtypes(include=[np.number]).columns] = imputer.fit_transform(df.select_dtypes(include=[np.number]))

df = pd.get_dummies(df, drop_first=True)
# df.isnull().sum()

Unnamed: 0,0
age,0
fnlwgt,0
education-num,0
capital-gain,0
capital-loss,0
...,...
native-country_Vietnam,0
native-country_Yugoslavia,0
income_<=50K.,0
income_>50K,0


### Input & Output

In [17]:
income_cols = df.filter(like="income").columns.tolist()

if not income_cols:
    raise ValueError(" Kolom target tidak ditemukan!")
y = df[income_cols[0]]
X = df.drop(columns=income_cols)

scaler_X = StandardScaler()
X = scaler_X.fit_transform(X)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=20)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

### Model & Training

In [26]:
# Define the neural network class
class IncomeClassifier(nn.Module):
    def __init__(self, input_dim):
        super(IncomeClassifier, self).__init__()
        # Define hidden layers
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))
        return x

# Initialize model, loss function, and optimizer
input_dim = X_train.shape[1]
model_torch = IncomeClassifier(input_dim)
criterion = nn.BCELoss()
optimizer = optim.Adam(model_torch.parameters(), lr=0.001, weight_decay=1e-5)

In [27]:
epochs = 200
for epoch in range(epochs):
    model_torch.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model_torch(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if (epoch+1) % 10 == 0:
      print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 10/200, Loss: 0.5225
Epoch 20/200, Loss: 0.4809
Epoch 30/200, Loss: 0.4372
Epoch 40/200, Loss: 0.4022
Epoch 50/200, Loss: 0.3757
Epoch 60/200, Loss: 0.3537
Epoch 70/200, Loss: 0.3430
Epoch 80/200, Loss: 0.3213
Epoch 90/200, Loss: 0.3071
Epoch 100/200, Loss: 0.2982
Epoch 110/200, Loss: 0.3001
Epoch 120/200, Loss: 0.2815
Epoch 130/200, Loss: 0.2786
Epoch 140/200, Loss: 0.2718
Epoch 150/200, Loss: 0.2629
Epoch 160/200, Loss: 0.2522
Epoch 170/200, Loss: 0.2527
Epoch 180/200, Loss: 0.2476
Epoch 190/200, Loss: 0.2472
Epoch 200/200, Loss: 0.2402


### Eval

In [28]:
model_torch.eval()
y_pred = []
y_true = []
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model_torch(batch_X)
        predictions = (outputs >= 0.5).float()
        y_pred.extend(predictions.numpy())
        y_true.extend(batch_y.numpy())

y_pred = np.array(y_pred)
y_true = np.array(y_true)

# Compute evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

Accuracy: 0.6640
Precision: 0.3009
Recall: 0.2366
F1 Score: 0.2649
AUC: 0.5238


In [None]:
import matplotlib.pyplot as plt

fpr, tpr, _ = roc_curve(y_true, y_pred)
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.4f})')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

## TensorFlow

### Library

In [30]:
import tensorflow as tf
from tensorflow import keras

### Input & Output

In [31]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train.values.reshape(-1, 1)))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test.values.reshape(-1, 1)))

train_dataset = train_dataset.batch(32).shuffle(buffer_size=len(X_train))
test_dataset = test_dataset.batch(32)

input_dim = X_train.shape[1]

### Model & Training

In [32]:
model_tf = keras.Sequential([
    keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

model_tf.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [33]:
# Add weight decay (L2 regularization)
for layer in model_tf.layers:
    if isinstance(layer, keras.layers.Dense):
        layer.kernel_regularizer = keras.regularizers.l2(1e-5)

# Train the model
epochs = 200
history = model_tf.fit(
    train_dataset,
    epochs=epochs,
    verbose=0,
    callbacks=[
        keras.callbacks.LambdaCallback(
            on_epoch_end=lambda epoch, logs: print(f"Epoch {epoch+1}/{epochs}, Loss: {logs['loss']:.4f}")
            if (epoch+1) % 10 == 0 else None
        )
    ]
)

Epoch 10/200, Loss: 0.5140
Epoch 20/200, Loss: 0.4558
Epoch 30/200, Loss: 0.4070
Epoch 40/200, Loss: 0.3702
Epoch 50/200, Loss: 0.3440
Epoch 60/200, Loss: 0.3205
Epoch 70/200, Loss: 0.3048
Epoch 80/200, Loss: 0.2910
Epoch 90/200, Loss: 0.2832
Epoch 100/200, Loss: 0.2663
Epoch 110/200, Loss: 0.2667
Epoch 120/200, Loss: 0.2546
Epoch 130/200, Loss: 0.2482
Epoch 140/200, Loss: 0.2466
Epoch 150/200, Loss: 0.2334
Epoch 160/200, Loss: 0.2313
Epoch 170/200, Loss: 0.2254
Epoch 180/200, Loss: 0.2184
Epoch 190/200, Loss: 0.2221
Epoch 200/200, Loss: 0.2185


### Eval

In [35]:
y_pred_proba = model_tf.predict(test_dataset)
y_pred = (y_pred_proba >= 0.5).astype(float)
y_true = y_test.values.reshape(-1, 1)

# Compute evaluation metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
auc = roc_auc_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

[1m458/458[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Accuracy: 0.6569
Precision: 0.2987
Recall: 0.2531
F1 Score: 0.2740
AUC: 0.5244


## Penjelasan

Metriks Evaluasi
1.   Akurasi
Besaran proporsi data yang diklasifikasikan dengan benar di antara jumlah total kasus. Rumusnya
> *(TP+TN)/(TP+TN+FP+FN)*

2.   Presisi
Mengukur seberapa banyak prediksi positif yang benar-benar positif. Rumusnya
>  *TP/(TP+FP)*

3.   Recall
Mengukur berapa banyak positif aktual yang diidentifikasi dengan benar oleh model. Rumusnya
>  *TP/(TP+FN)*

4.   F1 Score
Rata-rata harmonik dari presisi dan recall. Rumusnya
> 2 * (Presisi * Recall)/(Presisi+Recall)

5.   Area Under Curve (AUC)
 Mengukur kemampuan model untuk membedakan antara kelas positif dengan kelas negatif

6.   Receiver Operating Characteristic (ROC)
Kurva yang memetakan rasio positif sejati terhadap rasio positif palsu pada berbagai parameter





## Metriks Terbaik: F1 Score

Melihat datasets ('income.csv'), model ini mencoba menyelesaikan masalah klasifikasi biner. Target berupa pendapatan seseorang ≤ USD50.000 or > USD50.000. *Data tidak seimbang* karena jumlah data yang di bawah ≤ US$50K lebih banyak.

**Oleh karena itu, F1 Score adalah metrik evaluasi terbaik untuk implementasi ini:**

*   Metrik ini menyeimbangkan presisi dan perolehan, yang penting saat menangani kelas yang tidak seimbang
*   Metrik ini menghukum trade-off ekstrem antara presisi dan perolehan
*   Metrik ini menyediakan satu metrik yang memperhitungkan positif palsu dan negatif palsu
