In [299]:
import time

import numpy as np
import pandas as pd

In [300]:
headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex','capital-gain','capital-loss', 'hours-per-week', 'native-country', 'income']
train_df = pd.read_csv('./data/adult.data', names=headers, skipinitialspace=True)

test_df = pd.read_csv('./data/adult.test', names=headers, skipinitialspace=True)

train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()

In [301]:
# replace ? with NAN
train_df[train_df == '?'] = np.nan
test_df[test_df == '?'] = np.nan    

X_train = train_df.drop("income", axis=1)
Y_train = train_df["income"]
X_test = test_df.drop("income", axis=1)
Y_test = test_df["income"].str[:-1]
#occupation 1843
#workclass 1836
#native-country 583

In [302]:
from sklearn import model_selection
from sklearn.impute import KNNImputer
# preprocessing

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

def preprocess(train_data, test_data):
    # categorical to numerical
    features = headers[:-1]
    categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
    numerical = [col for col in features if col not in categorical]
    
    oe = [None] * len(categorical)
    for i, feature in enumerate(categorical):
            oe[i] = LabelEncoder()
            oe[i].fit(pd.concat([train_data[feature], test_data[feature]]))
            train_data[feature] = oe[i].transform(train_data[feature])
            test_data[feature] = oe[i].transform(test_data[feature])

    # impute missing value   
    imputer = KNNImputer(missing_values=np.nan)
    train_data = imputer.fit_transform(train_data)
    test_data = imputer.transform(test_data)

    train_data = pd.DataFrame(train_data, columns = headers[:-1])
    test_data = pd.DataFrame(test_data, columns = headers[:-1])
    
    # inverse transform
    for i, feature in enumerate(categorical):
        train_data[feature] = oe[i].inverse_transform(train_data[feature].astype(int))
        test_data[feature] = oe[i].inverse_transform(test_data[feature].astype(int))
        
    # categorical to one hot
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(pd.concat([train_data[categorical],test_data[categorical]]))
    
    oh_headers = ohe.get_feature_names_out(categorical)
    
    train_data_ohe = pd.DataFrame(ohe.transform(train_data[categorical]), columns=oh_headers)    
    test_data_ohe  = pd.DataFrame(ohe.transform(test_data[categorical]), columns=oh_headers)

    train_data.drop(columns=categorical,inplace=True)
    test_data.drop(columns=categorical,inplace=True)
    
    train_data = pd.concat([train_data, train_data_ohe], axis=1) 
    test_data = pd.concat([test_data, test_data_ohe], axis=1)
            
    #standardize numerical features
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_data[numerical])
    train_data[numerical]=scaler.transform(train_data[numerical]) 
    test_data[numerical]=scaler.transform(test_data[numerical])
    
    return train_data, test_data

In [303]:
X_train,X_test = preprocess(X_train, X_test)

In [304]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, Y_train)
tree = dtc.tree_
print(tree.max_depth, tree.n_leaves)

48 4666


In [305]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion':['gini', 'entropy', 'log_loss'], 
    'max_depth':[10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 50],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [50, 100, 200, 500, 2000, 5000],
    'random_state': [None, 42]
}

DTC_CV = GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid=param_grid, scoring='balanced_accuracy')
start_tuning = time.time()
DTC_CV.fit(X_train, Y_train)
end_tuning = time.time()
DTC_CV.best_params_

{'criterion': 'entropy',
 'max_depth': 20,
 'max_features': None,
 'max_leaf_nodes': 100,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'random_state': None}

In [306]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

bestDTC = DTC_CV.best_estimator_
start_training = time.time()
bestDTC.fit(X_train, Y_train)
end_training = time.time()

Y_pred = bestDTC.predict(X_test)
end_test = time.time()
BAS = balanced_accuracy_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print("BestDTC tuning time: ", end_tuning - start_tuning)
print("BestDTC training time: ", end_training - start_training)
print("BestDTC test time: ", end_test - end_training)
print("balanced accuracy: ", BAS)
print("accuracy: ", accuracy)

from joblib import dump
dump(bestDTC, 'bestDTC.joblib') 

BestDTC tuning time:  14795.254892587662
BestDTC training time:  0.5203840732574463
balanced accuracy:  0.7694522084986376
accuracy:  0.8609609240599656


['bestDTC.joblib']

In [None]:
from sklearn.svm import SVC

param_grid = {
    'kernel':['rbf', 'poly', 'sigmoid'], 
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100]
}

SVC_CV = GridSearchCV(estimator=SVC(),
             param_grid=param_grid, scoring='balanced_accuracy')
start_tuning = time.time()
SVC_CV.fit(X_train, Y_train)
end_tuning = time.time()
SVC_CV.best_params_

In [None]:
bestSVC = SVC_CV.best_estimator_
start_training = time.time()
bestSVC.fit(X_train, Y_train)
end_training = time.time()

Y_pred = bestSVC.predict(X_test)
end_test = time.time()
BAS = balanced_accuracy_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print("BestSVC tuning time: ", end_tuning - start_tuning)
print("BestSCC training time: ", end_training - start_training)
print("BestSVC test time: ", end_test - end_training)
print("balanced accuracy: ", BAS)
print("accuracy: ", accuracy)

from joblib import dump
dump(bestDTC, 'bestSVC.joblib') 