In [78]:
#import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import tree

In [4]:
# Read the dataset from 'adult.csv', then show its dimensions and first rows
df = pd.read_csv(filepath_or_buffer='adult.csv')
print(df.shape)
df.head(n=5)

(48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [5]:
# Class balance of the target: number of rows for each income label
df.loc[:, 'income'].value_counts()

income
<=50K    37155
>50K     11687
Name: count, dtype: int64

In [6]:
# Count NaN entries per column.
# NOTE(review): the previewed rows contain '?' in workclass/occupation, which a
# NaN check does not count - confirm whether '?' marks unknown values here.
df.isna().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [45]:
# Standardise the numeric columns on a copy, so `df` keeps the raw values.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done further down, so test rows contribute to the scaling statistics.
num_cols = [
    'age',
    'fnlwgt',
    'educational-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
scaler = StandardScaler()
df_copy = df.copy()
df_copy[num_cols] = scaler.fit(df_copy[num_cols]).transform(df_copy[num_cols])
df_copy.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.995129,Private,0.351675,11th,-1.197259,Never-married,Machine-op-inspct,Own-child,Black,Male,-0.144804,-0.217127,-0.034087,United-States,<=50K
1,-0.046942,Private,-0.945524,HS-grad,-0.419335,Married-civ-spouse,Farming-fishing,Husband,White,Male,-0.144804,-0.217127,0.77293,United-States,<=50K
2,-0.776316,Local-gov,1.394723,Assoc-acdm,0.74755,Married-civ-spouse,Protective-serv,Husband,White,Male,-0.144804,-0.217127,-0.034087,United-States,>50K
3,0.390683,Private,-0.277844,Some-college,-0.030373,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.886874,-0.217127,-0.034087,United-States,>50K
4,-1.505691,?,-0.815954,Some-college,-0.030373,Never-married,?,Own-child,White,Female,-0.144804,-0.217127,-0.841104,United-States,<=50K


In [46]:
# One-hot encode the categorical columns and binarise the target.
# This cell overwrites df_copy: re-running it without first re-running the
# scaling cell fails, because the categorical columns are already dropped.
encoder = OneHotEncoder(sparse_output=False)
cat_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']

# Build the encoded frame with its column names and with df_copy's own index:
# pd.concat(axis=1) aligns on index labels, so a fresh 0..n-1 index would
# misalign rows (or add NaN rows) whenever df_copy's index is not the default.
df_encoder = pd.DataFrame(
    encoder.fit_transform(df_copy[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=df_copy.index,
)
# Replace the categorical columns with their encoded counterparts
df_copy = df_copy.drop(cat_cols, axis=1)
df_copy = pd.concat([df_encoder, df_copy], axis=1)
# Encode the target: 1 for '>50K', 0 for every other label (vectorised)
df_copy['income'] = (df_copy['income'] == '>50K').astype('int64')
print(df_copy.shape)
df_copy.head()

(48842, 109)


Unnamed: 0,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.77293,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-0.776316,1.394723,0.74755,-0.144804,-0.217127,-0.034087,1
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,1
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,0


In [47]:
# Split the encoded frame into features/target, then hold out 20% for testing.
feature = df_copy.drop('income', axis=1)
target = df_copy['income']

# random_state is fixed so the shuffled split is reproducible across runs.
# NOTE(review): the split is not stratified although the income classes are
# imbalanced - consider stratify=target if class ratios must match exactly.
X_train, X_test, y_train, y_test = train_test_split(feature, target, random_state=1, test_size=0.2, shuffle=True)
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# Fixed label: this line reports y_test, so it must say "testing", not "training"
print('Shape of testing label:', y_test.shape)

Shape of training feature: (39073, 108)
Shape of testing feature: (9769, 108)
Shape of training label: (39073,)
Shape of training label: (9769,)


In [73]:
def evaluation_model(model, x_test, y_test):
    """Print an evaluation summary of a fitted model on held-out data.

    Prints, in order: the model itself, the confusion matrix, the
    classification report, and the value returned by ``model.score``
    (mean accuracy for scikit-learn classifiers).

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``score``.
    x_test : features to predict on.
    y_test : true labels matching ``x_test``.

    Returns
    -------
    None. Everything is written to stdout.
    """
    predicted = model.predict(x_test)
    print(model)
    # Both metric helpers take (true labels, predictions) and are printed as-is
    for summarise in (confusion_matrix, classification_report):
        print(summarise(y_test, predicted))
    print('Score: ', model.score(x_test, y_test))

In [74]:
# Fit a k-nearest-neighbours classifier (k=3) on the training split
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train, y=y_train)

In [75]:
# Report the KNN model's metrics on the held-out test split
evaluation_model(model=knn, x_test=X_test, y_test=y_test)

KNeighborsClassifier(n_neighbors=3)
[[6612  775]
 [ 935 1447]]
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      7387
           1       0.65      0.61      0.63      2382

    accuracy                           0.82      9769
   macro avg       0.76      0.75      0.76      9769
weighted avg       0.82      0.82      0.82      9769

Score:  0.8249564950353158


In [76]:
# Fit a logistic regression on the training split.
# max_iter is raised to 1000 to give the lbfgs solver more iterations.
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(X=X_train, y=y_train)

In [77]:
# Report the logistic regression's metrics on the held-out test split
evaluation_model(model=lr, x_test=X_test, y_test=y_test)

LogisticRegression(max_iter=1000)
[[6887  500]
 [ 917 1465]]
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7387
           1       0.75      0.62      0.67      2382

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.85      0.85      9769

Score:  0.8549493295117208


In [79]:
# Fit a decision tree on the training split; random_state is fixed for reproducibility
dtc = tree.DecisionTreeClassifier(random_state=0)
dtc.fit(X=X_train, y=y_train)

In [80]:
# Report the decision tree's metrics on the held-out test split
evaluation_model(model=dtc, x_test=X_test, y_test=y_test)

DecisionTreeClassifier(random_state=0)
[[6463  924]
 [ 873 1509]]
              precision    recall  f1-score   support

           0       0.88      0.87      0.88      7387
           1       0.62      0.63      0.63      2382

    accuracy                           0.82      9769
   macro avg       0.75      0.75      0.75      9769
weighted avg       0.82      0.82      0.82      9769

Score:  0.8160507728529021
