In [151]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

from keras.models import Sequential
from keras.layers import Dense
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from autokeras import StructuredDataClassifier as ml_classifier


In [152]:
data_path = 'D:\pythonProject\dainstudio_tasks\ml_dataset.csv'
data_df = pd.read_csv(data_path)
data_df = data_df.drop(columns='id')
data_df['income'] = data_df.apply(lambda x: int(x['income']=='>50K'),axis=1)
data_df.isnull().mean() * 100 

age               0.0
worklass          0.0
fnlwgt            0.0
education         0.0
education-num     0.0
martial-status    0.0
occupation        0.0
relationship      0.0
race              0.0
sex               0.0
capital-gain      0.0
capital-loss      0.0
hours-per-week    0.0
native-country    0.0
income            0.0
dtype: float64

In [153]:
cols_type_groups = data_df.columns.to_series().groupby(data_df.dtypes).groups
cols_type_groups = {k.name: v for k, v in cols_type_groups.items()}
num_cols = cols_type_groups['int64']
cat_cols = cols_type_groups['object']
num_data = data_df[num_cols]
data_df = pd.get_dummies(data_df, columns=cat_cols)
for key,value in cols_type_groups.items():
    print(key, list(value))

int64 ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'income']
object ['worklass', 'education', 'martial-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']


In [154]:
data_df.corr()[['income']].sort_values(by='income')

Unnamed: 0,income
martial-status_Never-married,-0.318440
relationship_Own-child,-0.228532
sex_Female,-0.215980
relationship_Not-in-family,-0.188497
occupation_Other-service,-0.156348
...,...
age,0.234037
education-num,0.335154
relationship_Husband,0.401035
martial-status_Married-civ-spouse,0.444696


In [155]:
# martial-status_Married-civ-spouse: 0.444696, indicates we need 
# to use tree based estimator 

In [156]:
X = data_df.drop(columns='income')
Y = data_df['income']
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    Y, 
    test_size=test_size, 
    random_state=seed
)

In [157]:
def train_evaluate(model):
    
    model.fit(X_train, y_train)
    test_y_pred = model.predict(X_test)
    train_y_pred = model.predict(X_train)

    print('train:---')
    print(str(classification_report(y_train, train_y_pred)))
    print('######')
    print('test:---')
    print(str(classification_report(y_test, test_y_pred)))
    

In [158]:
# xgboost 
model = XGBClassifier()
train_evaluate(model)

train:---
              precision    recall  f1-score   support

           0       0.92      0.96      0.94     16551
           1       0.86      0.75      0.80      5264

    accuracy                           0.91     21815
   macro avg       0.89      0.85      0.87     21815
weighted avg       0.91      0.91      0.91     21815

######
test:---
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      8169
           1       0.78      0.65      0.71      2577

    accuracy                           0.87     10746
   macro avg       0.84      0.80      0.81     10746
weighted avg       0.87      0.87      0.87     10746



In [159]:
# auto ml
model = ml_classifier(max_trials=5)
train_evaluate(model)

INFO:tensorflow:Reloading Oracle from existing project .\structured_data_classifier\oracle.json
INFO:tensorflow:Reloading Tuner from .\structured_data_classifier\tuner0.json
INFO:tensorflow:Oracle triggered exit
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
INFO:tensorflow:Assets written to: .\structured_data_classifier\best_model\assets
train:---
              precision    recall  f1-score   support

           0       0.87      0.94      0.91     16551
           1       0.75      0.57      0.65      5264

    accuracy                           0.85     21815
   macro avg       0.81      0.75      0.78     21815
weighted avg       0.84      0.85      0.84     21815

######
test:---
              precision    recall  f1-score   support

           0       0.87      0.94      0.90      8169
           1       0.73      0.54      0.62      2577

    ac

In [160]:
def create_model():
    model = Sequential()
    model.add(Dense(108, input_dim=108, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model 

estimator = KerasClassifier(
        build_fn=create_model, 
        epochs=100, 
        batch_size=30, 
        verbose=0
    )

In [161]:
# dl binary classifier
model = estimator
train_evaluate(model)

train:---
              precision    recall  f1-score   support

           0       0.79      0.99      0.88     16551
           1       0.90      0.14      0.25      5264

    accuracy                           0.79     21815
   macro avg       0.84      0.57      0.56     21815
weighted avg       0.81      0.79      0.73     21815

######
test:---
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      8169
           1       0.88      0.14      0.24      2577

    accuracy                           0.79     10746
   macro avg       0.83      0.57      0.56     10746
weighted avg       0.81      0.79      0.73     10746



In [162]:
# In general, xgboost gives the best performance regarding accuracy for 
# both train and test,  and it is only overfitted a little bit.
# auto ml is slightly worse, but both dl and auto ml is a little under fitted.
# dl is sometimes very bad with around 45% accuracy and f1 score, so I think 
# it is not very stable, but both xgboost and auto ml is stable.
