In [19]:
import time

import numpy as np
import pandas as pd

In [20]:
headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex','capital-gain','capital-loss', 'hours-per-week', 'native-country', 'income']
train_df = pd.read_csv('./data/adult.data', names=headers, skipinitialspace=True)

test_df = pd.read_csv('./data/adult.test', names=headers, skipinitialspace=True)

train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()

In [21]:
# replace ? with NAN
train_df[train_df == '?'] = np.nan
test_df[test_df == '?'] = np.nan    

X_train = train_df.drop("income", axis=1)
Y_train = train_df["income"]
X_test = test_df.drop("income", axis=1)
Y_test = test_df["income"].str[:-1]
categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
X_train.info()
#occupation 1843
#workclass 1836
#native-country 583

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32537 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   workclass       30701 non-null  object
 2   fnlwgt          32537 non-null  int64 
 3   education       32537 non-null  object
 4   education-num   32537 non-null  int64 
 5   marital-status  32537 non-null  object
 6   occupation      30694 non-null  object
 7   relationship    32537 non-null  object
 8   race            32537 non-null  object
 9   sex             32537 non-null  object
 10  capital-gain    32537 non-null  int64 
 11  capital-loss    32537 non-null  int64 
 12  hours-per-week  32537 non-null  int64 
 13  native-country  31955 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.7+ MB


In [22]:
from sklearn import model_selection
from sklearn.impute import KNNImputer
# preprocessing

from sklearn import preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

def preprocess(train_data, test_data):
    # categorical to numerical
    features = headers[:-1]
    categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
    numerical = [col for col in features if col not in categorical]
    
    oe = OrdinalEncoder()
    oe.fit(pd.concat([train_data[categorical],test_data[categorical]]))

    train_data_oe = pd.DataFrame(oe.transform(train_data[categorical]), columns=categorical)    
    test_data_oe  = pd.DataFrame(oe.transform(test_data[categorical]), columns=categorical)

    train_data_oe.index = train_data.index
    test_data_oe.index  = test_data.index
    train_data.drop(columns=categorical,inplace=True)
    test_data.drop(columns=categorical,inplace=True)
    train_data = pd.concat([train_data, train_data_oe], axis=1) 
    test_data = pd.concat([test_data, test_data_oe], axis=1)

    
    # impute missing value   
    imputer = KNNImputer(missing_values=np.nan)
    train_data = imputer.fit_transform(train_data)
    test_data = imputer.transform(test_data)
    
    imputed_headers = imputer.get_feature_names_out()

    train_data = pd.DataFrame(train_data, columns=imputed_headers)
    test_data = pd.DataFrame(test_data, columns=imputed_headers)

    # inverse transform
    train_data_cat = pd.DataFrame(oe.inverse_transform(train_data[categorical]), columns=categorical) 
    test_data_cat = pd.DataFrame(oe.inverse_transform(test_data[categorical]), columns=categorical) 
    
    train_data_cat.index = train_data.index
    test_data_cat.index = test_data.index   
    train_data.drop(columns=categorical,inplace=True)
    test_data.drop(columns=categorical,inplace=True)
    train_data = pd.concat([train_data, train_data_cat], axis=1) 
    test_data = pd.concat([test_data, test_data_cat], axis=1)

    # categorical to one hot
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(pd.concat([train_data[categorical],test_data[categorical]]))

    oh_headers = ohe.get_feature_names_out()

    train_data_ohe = pd.DataFrame(ohe.transform(train_data[categorical]), columns=oh_headers)    
    test_data_ohe  = pd.DataFrame(ohe.transform(test_data[categorical]), columns=oh_headers)
    
    train_data_ohe.index = train_data.index
    test_data_ohe.index  = test_data.index
    train_data.drop(columns=categorical,inplace=True)
    test_data.drop(columns=categorical,inplace=True)

    train_data = pd.concat([train_data, train_data_ohe], axis=1) 
    test_data = pd.concat([test_data, test_data_ohe], axis=1)

    #standardize numerical features
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_data[numerical])
    train_data[numerical] = scaler.transform(train_data[numerical])
    test_data[numerical] = scaler.transform(test_data[numerical])
    # 
    # train_data_num.index = train_data.index
    # test_data_num.index = test_data.index   
    # train_data.drop(columns=numerical,inplace=True)
    # test_data.drop(columns=numerical,inplace=True)
    # train_data = pd.concat([train_data, train_data_num], axis=1) 
    # test_data = pd.concat([test_data, test_data_num], axis=1)
    
    return train_data, test_data

In [23]:
X_train,X_test = preprocess(X_train, X_test)

In [24]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, Y_train)
tree = dtc.tree_
print(tree.max_depth, tree.n_leaves)

47 4681


In [25]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion':['gini', 'entropy', 'log_loss'], 
    'max_depth':[10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20, 50],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [50, 100, 200, 500, 2000, 5000],
    'random_state': [None, 42]
}

DTC_CV = GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid=param_grid, scoring='balanced_accuracy')
start_tuning = time.time()
DTC_CV.fit(X_train, Y_train)
end_tuning = time.time()
DTC_CV.best_params_

{'criterion': 'entropy',
 'max_depth': 20,
 'max_features': None,
 'max_leaf_nodes': 100,
 'min_samples_leaf': 1,
 'min_samples_split': 20,
 'random_state': 42}

In [26]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

bestDTC = DTC_CV.best_estimator_
start_training = time.time()
bestDTC.fit(X_train, Y_train)
end_training = time.time()

Y_pred = bestDTC.predict(X_test)
end_test = time.time()
BAS = balanced_accuracy_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print("BestDTC tuning time: ", end_tuning - start_tuning)
print("BestDTC training time: ", end_training - start_training)
print("BestDTC test time: ", end_test - end_training)
print("balanced accuracy: ", BAS)
print("accuracy: ", accuracy)

from joblib import dump
dump(bestDTC, 'bestDTC.joblib') 

BestDTC tuning time:  12323.356417417526
BestDTC training time:  0.3066389560699463
BestDTC test time:  0.022972822189331055
balanced accuracy:  0.7815567489956236
accuracy:  0.861759646104694


['bestDTC.joblib']

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score, accuracy_score

param_grid = {
    'n_neighbors':[1, 2, 5, 10], 
    'weights': ['uniform', 'distance'],
    'p': [1, 2, 3]
}

# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# Y_train = le.fit_transform(Y_train)
# Y_test = le.transform(Y_test)

KNN_CV = GridSearchCV(estimator=KNeighborsClassifier(n_jobs=-1),
             param_grid=param_grid, scoring='balanced_accuracy')
start_tuning = time.time()
KNN_CV.fit(X_train, Y_train)
end_tuning = time.time()
KNN_CV.best_params_

{'n_neighbors': 10, 'p': 3, 'weights': 'distance'}

In [29]:
bestKNN = KNN_CV.best_estimator_
start_training = time.time()
bestKNN.fit(X_train, Y_train)
end_training = time.time()

Y_pred = bestKNN.predict(X_test)
end_test = time.time()
BAS = balanced_accuracy_score(Y_test, Y_pred)
accuracy = accuracy_score(Y_test, Y_pred)
print("bestKNN tuning time: ", end_tuning - start_tuning)
print("bestKNN training time: ", end_training - start_training)
print("bestKNN test time: ", end_test - end_training)
print("balanced accuracy: ", BAS)
print("accuracy: ", accuracy)

from joblib import dump
dump(bestKNN, 'bestKNN.joblib') 

bestKNN tuning time:  807.252690076828
bestKNN training time:  0.03091144561767578
bestKNN test time:  55.24993324279785
balanced accuracy:  0.7578907194904048
accuracy:  0.8382280658638486


['bestKNN.joblib']