In [60]:
import pandas as pd

categorical_cols = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

df = pd.read_csv(
    "./dataset/adult/adult.csv",
    sep=",",
    names=[
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education-num",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "capital-gain",
        "capital-loss",
        "hours-per-week",
        "native-country",
        "Income",
    ],
)

# Drop rows with missing values
df.replace(" ?", pd.NA, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [61]:
def cal_matrix(_clf, y_test, y_pred):
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
    from sklearn.model_selection import cross_val_score

    print("Confusion_matrix:")
    print(confusion_matrix(y_test, y_pred))
    print()

    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print()

    accuracy_scores = cross_val_score(_clf, X, y, cv=5, scoring="accuracy")
    print(f"Fold accuracies: {accuracy_scores}")
    print(f"Mean cross-validated accuracy: {accuracy_scores.mean():.4f}")

    print(f"Tree Size: {_clf.tree_.node_count}")
    print(f"rootname: {df.columns[_clf.tree_.feature[0]]}")


In [62]:
from sklearn.tree import DecisionTreeClassifier


def make_eval(_clf: DecisionTreeClassifier, X_train, X_test, y_train, y_test):
    _clf.fit(X_train, y_train)
    y_pred = _clf.predict(X_test)
    cal_matrix(_clf, y_test, y_pred)

    return _clf

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

X = df.drop(columns=["Income"]).values
df["Income"] = le.fit_transform(df["Income"])
y = df["Income"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [64]:
clf_a = make_eval(
    DecisionTreeClassifier(
        random_state=42,
        criterion="gini",
        ccp_alpha=0,
        splitter="best",
    ),
    X_train,
    X_test,
    y_train,
    y_test,
)

Confusion_matrix:
[[5883  884]
 [ 886 1396]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      6767
           1       0.61      0.61      0.61      2282

    accuracy                           0.80      9049
   macro avg       0.74      0.74      0.74      9049
weighted avg       0.80      0.80      0.80      9049


Fold accuracies: [0.80540361 0.79446378 0.80935013 0.81100796 0.80669761]
Mean cross-validated accuracy: 0.8054
Tree Size: 6467
rootname: relationship


In [65]:
clf_b = make_eval(
    DecisionTreeClassifier(
        random_state=42,
        criterion="entropy",
        ccp_alpha=0,
        splitter="best",
    ),
    X_train,
    X_test,
    y_train,
    y_test,
)

Confusion_matrix:
[[5905  862]
 [ 877 1405]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      6767
           1       0.62      0.62      0.62      2282

    accuracy                           0.81      9049
   macro avg       0.75      0.74      0.74      9049
weighted avg       0.81      0.81      0.81      9049


Fold accuracies: [0.80822145 0.79380076 0.81548408 0.81200265 0.81233422]
Mean cross-validated accuracy: 0.8084
Tree Size: 6257
rootname: relationship


In [66]:
clf_c = make_eval(
    DecisionTreeClassifier(
        random_state=42,
        criterion="gini",
        ccp_alpha=0.001,
        splitter="best",
    ),
    X_train,
    X_test,
    y_train,
    y_test,
)

Confusion_matrix:
[[6481  286]
 [1071 1211]]

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.96      0.91      6767
           1       0.81      0.53      0.64      2282

    accuracy                           0.85      9049
   macro avg       0.83      0.74      0.77      9049
weighted avg       0.85      0.85      0.84      9049


Fold accuracies: [0.84253274 0.84435604 0.84897215 0.85394562 0.85179045]
Mean cross-validated accuracy: 0.8483
Tree Size: 35
rootname: relationship


In [67]:
clf_d = make_eval(
    DecisionTreeClassifier(
        random_state=42,
        criterion="gini",
        ccp_alpha=0.001,
        splitter="random",
    ),
    X_train,
    X_test,
    y_train,
    y_test,
)

Confusion_matrix:
[[6346  421]
 [1087 1195]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.94      0.89      6767
           1       0.74      0.52      0.61      2282

    accuracy                           0.83      9049
   macro avg       0.80      0.73      0.75      9049
weighted avg       0.82      0.83      0.82      9049


Fold accuracies: [0.81021051 0.82711752 0.83255968 0.82327586 0.83372016]
Mean cross-validated accuracy: 0.8254
Tree Size: 41
rootname: marital-status


In [68]:
import graphviz
from sklearn import tree

dot_data = tree.export_graphviz(
    clf_d,
    out_file=None,
    feature_names=df.columns[:-1],
    class_names=le.classes_,
)
graph = graphviz.Source(dot_data)
graph.render("graph")

'graph.pdf'

In [69]:
from sklearn.feature_selection import SelectKBest, f_classif

X_new = SelectKBest(f_classif, k=5).fit_transform(X, y)

X.shape, X_new.shape

((30162, 14), (30162, 5))

In [70]:
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.3, random_state=42
)

clf_d_v2 = make_eval(
    DecisionTreeClassifier(
        random_state=42,
        criterion="gini",
        ccp_alpha=0.001,
        splitter="random",
    ),
    X_train,
    X_test,
    y_train,
    y_test,
)

Confusion_matrix:
[[6447  320]
 [1146 1136]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.95      0.90      6767
           1       0.78      0.50      0.61      2282

    accuracy                           0.84      9049
   macro avg       0.81      0.73      0.75      9049
weighted avg       0.83      0.84      0.82      9049


Fold accuracies: [0.81021051 0.82711752 0.83255968 0.82327586 0.83372016]
Mean cross-validated accuracy: 0.8254
Tree Size: 35
rootname: workclass
