In [13]:
from pathlib import Path
import pandas as pd

# Column names for the two data files; they are passed to read_csv through
# `names=` below.
colums = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# The data folder is resolved relative to the parent of the current working
# directory, so the result depends on where the kernel was started.
data_dir = Path.cwd().parent / "data"
train_path, test_path = data_dir / "adult.data", data_dir / "adult.test"

# Cells whose text is exactly ' ?' (leading space included) are parsed as NaN.
train_data = pd.read_csv(train_path, names=colums, na_values=' ?')
# The first line of the test file is skipped
# (presumably a non-data header line; confirm against the file).
test_data = pd.read_csv(test_path, names=colums, na_values=' ?', skiprows=1)

# Preview the first training rows, then the (rows, columns) shape of each split.
print(train_data.head())
print(train_data.shape, test_data.shape, sep="\n")


   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [17]:
# Discard every row that contains at least one missing value, in both splits.
train_data, test_data = (split.dropna() for split in (train_data, test_data))

In [18]:
# Report the (rows, columns) shape of each split, one per line.
print(train_data.shape, test_data.shape, sep="\n")

(30162, 15)
(15060, 15)


In [None]:
# Separate the features from the income target, then one-hot encode the
# categorical features so that both splits end up with identical columns.

# Canonical income classes after whitespace and a trailing period are removed.
INCOME_CODES = {'<=50K': 0, '>50K': 1}


def encode_income(labels: pd.Series) -> pd.Series:
    """Convert raw income labels to 0 ('<=50K') or 1 ('>50K').

    Surrounding whitespace and a trailing period are stripped first, so one
    mapping serves both splits (the test labels carry a trailing '.').

    Args:
        labels: Series of raw income strings.

    Returns:
        An int64 Series of 0/1 codes sharing the index of ``labels``.

    Raises:
        ValueError: if any label is not one of the two known classes. A plain
            ``Series.map`` would silently turn such labels into NaN.
    """
    cleaned = labels.str.strip().str.rstrip('.')
    encoded = cleaned.map(INCOME_CODES)
    unknown = cleaned[encoded.isna()].unique().tolist()
    if unknown:
        raise ValueError(f"Unrecognised income labels: {unknown}")
    return encoded.astype('int64')


X_train = train_data.drop(columns='income')
y_train = encode_income(train_data['income'])

X_test = test_data.drop(columns='income')
y_test = encode_income(test_data['income'])
print(X_train.shape, X_test.shape) # Before one-hot feature count

# Every non-numeric column is treated as categorical. Selecting by
# "not a number" keeps the result independent of whether pandas stores
# strings as `object` or as a dedicated string dtype.
categorical_cols = X_train.select_dtypes(exclude=['number']).columns
print("Categorical columns:", categorical_cols)

X_train = pd.get_dummies(X_train, columns=categorical_cols)
X_test = pd.get_dummies(X_test, columns=categorical_cols)

# join='left' keeps exactly the training columns: dummy columns missing from
# the test split are added and filled with 0, and dummy columns that exist
# only in the test split are dropped.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)
assert X_train.columns.equals(X_test.columns), "train/test feature columns differ"
print(X_train.shape, X_test.shape) # After one-hot feature count

(30162, 14) (15060, 14)
Categorical columns: Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')
(30162, 104) (15060, 104)


In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit a decision tree on the encoded training features. Only the random seed
# is set here; no depth or leaf-size limits are passed to the constructor.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the data the tree was fitted on and on the held-out split, then
# compare both accuracies; a large gap between them points to overfitting.
train_pred, test_pred = (model.predict(features) for features in (X_train, X_test))
train_acc, test_acc = accuracy_score(y_train, train_pred), accuracy_score(y_test, test_pred)

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")

Training Accuracy: 1.0000
Testing Accuracy: 0.8047
