In [220]:
import numpy as np
import pandas as pd

In [221]:
headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex','capital-gain','capital-loss', 'hours-per-week', 'native-country', 'income']
train_df = pd.read_csv('./data/adult.data', names=headers, skipinitialspace=True)

test_df = pd.read_csv('./data/adult.test', names=headers, skipinitialspace=True)

train_df = train_df.drop_duplicates()
test_df = test_df.drop_duplicates()

In [222]:
# replace ? with NAN
train_df[train_df == '?'] = np.nan
test_df[test_df == '?'] = np.nan    

X_train = train_df.drop("income", axis=1)
Y_train = train_df["income"]
X_test = test_df.drop("income", axis=1)
Y_test = test_df["income"].str[:-1]
#occupation 1843
#workclass 1836
#native-country 583

In [223]:
from sklearn import model_selection
from sklearn.impute import KNNImputer
# preprocessing

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

def preprocess(train_data, test_data):
    # categorical to numerical
    features = headers[:-1]
    categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
    numerical = [col for col in features if col not in categorical]
    
    oe = [None] * len(categorical)
    for i, feature in enumerate(categorical):
            oe[i] = LabelEncoder()
            oe[i].fit(pd.concat([train_data[feature], test_data[feature]]))
            train_data[feature] = oe[i].transform(train_data[feature])
            test_data[feature] = oe[i].transform(test_data[feature])

    # impute missing value   
    imputer = KNNImputer(missing_values=np.nan)
    train_data = imputer.fit_transform(train_data)
    test_data = imputer.transform(test_data)

    train_data = pd.DataFrame(train_data, columns = headers[:-1])
    test_data = pd.DataFrame(test_data, columns = headers[:-1])
    
    # inverse transform
    for i, feature in enumerate(categorical):
        train_data[feature] = oe[i].inverse_transform(train_data[feature].astype(int))
        test_data[feature] = oe[i].inverse_transform(test_data[feature].astype(int))
        
    # categorical to one hot
    ohe = OneHotEncoder(sparse_output=False)
    ohe.fit(pd.concat([train_data[categorical],test_data[categorical]]))
    
    oh_headers = ohe.get_feature_names_out(categorical)
    
    train_data_ohe = pd.DataFrame(ohe.transform(train_data[categorical]), columns=oh_headers)    
    test_data_ohe  = pd.DataFrame(ohe.transform(test_data[categorical]), columns=oh_headers)

    train_data.drop(columns=categorical,inplace=True)
    test_data.drop(columns=categorical,inplace=True)
    
    train_data = pd.concat([train_data, train_data_ohe], axis=1) 
    test_data = pd.concat([test_data, test_data_ohe], axis=1)
            
    #standardize numerical features
    scaler = preprocessing.StandardScaler()
    scaler.fit(train_data[numerical])
    train_data[numerical]=scaler.transform(train_data[numerical]) 
    test_data[numerical]=scaler.transform(test_data[numerical])
    
    return train_data, test_data

In [224]:
X_train,X_test = preprocess(X_train, X_test)
X_train

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,native-country_nan
0,0.030390,-1.063569,1.134777,0.148292,-0.216743,-0.035664,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.836973,-1.008668,1.134777,-0.145975,-0.216743,-2.222483,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.042936,0.245040,-0.420679,-0.145975,-0.216743,-0.035664,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.056950,0.425752,-1.198407,-0.145975,-0.216743,-0.035664,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.776193,1.408066,1.134777,-0.145975,-0.216743,-0.035664,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,-0.849519,0.639678,0.745913,-0.145975,-0.216743,-0.197650,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32533,0.103716,-0.335436,-0.420679,-0.145975,-0.216743,-0.035664,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32534,1.423579,-0.358779,-0.420679,-0.145975,-0.216743,-0.035664,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
32535,-1.216148,0.110930,-0.420679,-0.145975,-0.216743,-1.655530,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [254]:
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
pca.explained_variance_ratio_

array([1.49000316e-01, 1.07310976e-01, 9.81836632e-02, 9.67011753e-02,
       8.82783373e-02, 8.05718509e-02, 5.83567535e-02, 2.88681794e-02,
       2.53564623e-02, 2.32822635e-02, 1.86533297e-02, 1.61925125e-02,
       1.58821421e-02, 1.38851155e-02, 1.24101825e-02, 1.09433934e-02,
       1.04929856e-02, 1.00601222e-02, 9.69794352e-03, 9.63529461e-03,
       8.36800824e-03, 8.14282611e-03, 7.34641967e-03, 6.55138804e-03,
       5.61944146e-03, 5.43201352e-03, 4.85507143e-03, 4.45945935e-03,
       4.33241095e-03, 4.16847451e-03, 3.95120067e-03, 3.56369992e-03,
       3.52146290e-03, 3.29546336e-03, 3.22953870e-03, 3.15274045e-03,
       2.96792621e-03, 2.89983420e-03, 2.73638532e-03, 2.47574290e-03,
       2.41086375e-03, 1.83853451e-03, 1.79740663e-03, 1.75362234e-03,
       1.59522826e-03, 1.39254592e-03, 1.36013183e-03, 1.32800047e-03,
       1.29045201e-03, 1.10906889e-03, 8.93724694e-04, 8.31900628e-04,
       5.71575908e-04, 4.88255319e-04, 4.50608163e-04, 3.86932026e-04,
      

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'criterion':['gini', 'entropy', 'log_loss'], 
    'max_depth':[10, 20, 30, 40, 50],
    'min_samples_leaf': [1, 2, 5, 10, 20, 50],
    'max_leaf_nodes': [50, 100, 200, 500, 2000, 4000, 5000]
}

clf = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
             param_grid=param_grid, scoring='balanced_accuracy')
clf.fit(X_train, Y_train)

In [239]:
bestDTC = clf.best_estimator_
from joblib import dump, load
dump(bestDTC, 'bestDTC.joblib') 

['bestDTC.joblib']

In [251]:
bestDTC = load('bestDTC.joblib') 
Y_pred = bestDTC.predict(X_test)

In [253]:
from sklearn.metrics import balanced_accuracy_score

BAS = balanced_accuracy_score(Y_test, Y_pred)
BAS

0.7808880641629528