In [1]:
# Load libraries

import numpy as np
import pandas as pd
from common_functions import preprocess_data
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=adult_columns)
data = data.replace(to_replace= ' ?', value = np.nan)

TARGET = 'Income'

In [3]:
print(data.shape)
data.head()

(32561, 15)


Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Cluster Education to 4 categories: 1)under graduates, 2)high school graduates, 3)some college  and 4)above 

# def cluster_education(df):
#     df.loc[
#         lambda x: x["Education-Num"].between(0, 8, "both"), "Education"
#     ] = "under-grad"

#     df.loc[
#         lambda x: x["Education-Num"] == 9, "Education"
#     ] = "HS-grad"

#     df.loc[
#         lambda x: x["Education-Num"] == 10, "Education"
#     ] = "Some-college"

#     df.loc[
#         lambda x: x["Education-Num"].between(11, 16, 'both'), "Education"
#     ] = "above-grad"

# cluster_education(data)

# display(pd.crosstab(data["Education"], data[TARGET], margins=True))

In [19]:
# drop instances with 'Without-pay' and 'Never-worked' values in Workclass

data = data[~data['Workclass'].isin([' Never-worked', ' Without-pay'])]

display(pd.crosstab(data["Workclass"], data[TARGET], margins=True))


Income,<=50K,>50K,All
Workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Federal-gov,578,365,943
Local-gov,1458,609,2067
Private,17410,4876,22286
Self-emp-inc,474,600,1074
Self-emp-not-inc,1785,714,2499
State-gov,935,344,1279
All,22640,7508,30148


In [6]:
# Cluster countries to developed and developing

data.loc[
    lambda x: x["Country"].isin([' Holand-Netherlands', ' Scotland', ' Italy', ' England', ' Ireland', ' Germany', ' Hong',  ' France', ' Taiwan', 
                                 ' Japan', ' Puerto-Rico', ' Canada', ' United-States']), "Country"
] = "Developed"

data.loc[
    lambda x: x["Country"].isin([' Hungary', ' Greece', ' Portugal', ' Poland', ' Yugoslavia', ' Cambodia', ' Iran',  ' Philippines', ' Laos', ' Thailand', ' Vietnam', ' South', 
                                 ' China', ' India', ' Honduras', ' Outlying-US(Guam-USVI-etc)', ' Trinadad&Tobago', ' Ecuador',  ' Philippines', ' Nicaragua',
                                 ' Peru', ' Haiti', ' Columbia', ' Guatemala', ' Dominican-Republic', ' Jamaica',  ' Cuba', ' El-Salvador', ' Mexico']), "Country"
] = "Developing"

display(pd.crosstab(data["Country"], data[TARGET], margins=True))


Income,<=50K,>50K,All
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Developed,22493,7391,29884
Developing,1769,304,2073
All,24262,7695,31957


In [9]:
data = data.drop(['Education-Num'], axis = 1)
data = data.dropna(how='any')
data.shape

(30148, 14)

In [10]:
data_quantitative = data.select_dtypes(include=['number'])
numerical_features_list = data_quantitative.columns
categorical_features_list = ["Workclass", "Marital Status", "Ethnic group", "Sex", "Country"]


In [11]:
data

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,Developed,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,Developed,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,Developed,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,Developed,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Developing,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,Developed,<=50K
32557,40,Private,154374,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,Developed,>50K
32558,58,Private,151910,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,Developed,<=50K
32559,22,Private,201490,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,Developed,<=50K


In [12]:
data_preprocessed = preprocess_data(data, numerical_features_list,  categorical_features_list)
data_preprocessed

Unnamed: 0,Income,ordinal__Education,stand scaler__Age,stand scaler__final weight,stand scaler__Capital Gain,stand scaler__Capital Loss,stand scaler__Hours per week,onehot__Workclass_ Local-gov,onehot__Workclass_ Private,onehot__Workclass_ Self-emp-inc,...,onehot__Marital Status_ Married-spouse-absent,onehot__Marital Status_ Never-married,onehot__Marital Status_ Separated,onehot__Marital Status_ Widowed,onehot__Ethnic group_ Asian-Pac-Islander,onehot__Ethnic group_ Black,onehot__Ethnic group_ Other,onehot__Ethnic group_ White,onehot__Sex_ Male,onehot__Country_Developing
0,0,12.0,0.043145,-1.062699,0.146022,-0.218639,-0.078078,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0,12.0,0.881007,-1.007853,-0.147449,-0.218639,-2.332618,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0,8.0,-0.033024,0.244604,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0,6.0,1.109515,0.425136,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0,12.0,-0.794716,1.406469,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30143,0,11.0,-0.870886,0.638849,-0.147449,-0.218639,-0.245081,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30144,1,8.0,0.119314,-0.335292,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
30145,0,8.0,1.490361,-0.358612,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
30146,0,8.0,-1.251732,0.110628,-0.147449,-0.218639,-1.748108,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [13]:
# apply ordinal encording to Education-cluster using pandas

# scale_mapper = {'under-grad':0, 'Some-college':0.3, 'HS-grad':0.7, 'above-grad':1}
# data_ordinal = data["Education"].replace(scale_mapper)
# data_ordinal.head()

In [16]:
X_train, X_cv, y_train, y_cv = train_test_split(
    data_preprocessed.drop(columns = [TARGET]), 
    data_preprocessed[TARGET], 
    test_size=0.20, 
    stratify=data_preprocessed.iloc[:, 0]
)

In [17]:
# track f1-scores of models:
scores = pd.DataFrame()

# apply random forest classifier

# initialize model
rf_plain = RandomForestClassifier()

def rand_forest_evaluate(model):
    
    model.fit(X_train, y_train)

    # count f1-score on training
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')

    # predict on train set
    y_train_pred = model.predict(X_train)

    # predict on test set
    y_cv_pred = model.predict(X_cv)

    # f1-score
    print(f"o F1-score (cross-val) on train set: {np.mean(score)}")

    # classification report
    print("o Classification report on train set:")
    print(classification_report(y_train, y_train_pred))
    print("o Classification report on test set:")

    print(classification_report(y_cv, y_cv_pred))
    
    return score

scores['Plain RF'] = rand_forest_evaluate(rf_plain)

o F1-score (cross-val) on train set: 0.7832822692502658
o Classification report on train set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18112
           1       1.00      1.00      1.00      6006

    accuracy                           1.00     24118
   macro avg       1.00      1.00      1.00     24118
weighted avg       1.00      1.00      1.00     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      4528
           1       0.71      0.60      0.65      1502

    accuracy                           0.84      6030
   macro avg       0.79      0.76      0.77      6030
weighted avg       0.83      0.84      0.83      6030



This model highly overfits the data. Let's try to tweak RandomForest's hyperparameters to avoid overfitting:

In [18]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Different class weights
class_weight = [{0:1, 1:1}, {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:3}]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap, 
               'class_weight': class_weight}

from pprint import pprint
pprint(random_grid)

{'bootstrap': [True, False],
 'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 1.5}, {0: 1, 1: 2}, {0: 1, 1: 3}],
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}


In [15]:
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=random_grid, scoring='f1', n_iter=100, n_jobs = -1, return_train_score=True)

rf_random.fit(X_train, y_train)

In [16]:
rf_random.best_params_

{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_depth': 10,
 'class_weight': {0: 1, 1: 2},
 'bootstrap': True}

Now lets train the model again with searche hyperparameters:

In [17]:
rf_rand_search = RandomForestClassifier(n_estimators = 200,
                                        min_samples_split = 5,
                                        min_samples_leaf = 1,
                                        max_depth = 10,
                                        class_weight = {0: 1, 1: 2},
                                        bootstrap = True)

scores['Random Search RF'] = rand_forest_evaluate(rf_rand_search)

o F1-score (cross-val) on train set: 0.7974647611016495
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90     18112
           1       0.69      0.78      0.73      6006

    accuracy                           0.86     24118
   macro avg       0.81      0.83      0.82     24118
weighted avg       0.86      0.86      0.86     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      4528
           1       0.65      0.75      0.70      1502

    accuracy                           0.84      6030
   macro avg       0.78      0.81      0.79      6030
weighted avg       0.85      0.84      0.84      6030



The model still overfits a little bit. As the first hyperparameres search was quite rough, let's now try to make a more specific one around smaller amount of passible values. This time using the GridSearchCV:


In [18]:
n_estimators = [100, 200]
# Maximum number of levels in tree
max_depth = [7,8,9]
# Minimum number of samples required to split a node
min_samples_split = [4, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
boorstrap = [True]
# Different class weights
class_weight = [{0:1, 1:1.5}, {0:1, 1:1.6}, {0:1, 1:1.7}, {0:1, 1:1.8}, {0:1, 1:1.9}, {0:1, 1:2}]

# Create the random grid
grid_params = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': boorstrap,
               'class_weight': class_weight}

grid_params

{'n_estimators': [100, 200],
 'max_depth': [7, 8, 9],
 'min_samples_split': [4, 5, 6],
 'min_samples_leaf': [1, 2],
 'bootstrap': [True],
 'class_weight': [{0: 1, 1: 1.5},
  {0: 1, 1: 1.6},
  {0: 1, 1: 1.7},
  {0: 1, 1: 1.8},
  {0: 1, 1: 1.9},
  {0: 1, 1: 2}]}

In [19]:
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid_params, scoring='f1', n_jobs = -1, cv=4)
rf_grid_search.fit(X_train, y_train)

In [20]:
rf_grid_search.best_params_

{'bootstrap': True,
 'class_weight': {0: 1, 1: 1.9},
 'max_depth': 9,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 200}

In [21]:
rf_grid_search = RandomForestClassifier(n_estimators = 200,
                            min_samples_split = 6,
                            min_samples_leaf = 2,
                            max_depth = 9,
                            class_weight = {0: 1, 1: 1.9},
                            bootstrap = True)

scores['Grid Search RF'] = rand_forest_evaluate(rf_grid_search)


o F1-score (cross-val) on train set: 0.799732058586643
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.92      0.89      0.90     18112
           1       0.69      0.76      0.72      6006

    accuracy                           0.85     24118
   macro avg       0.80      0.82      0.81     24118
weighted avg       0.86      0.85      0.86     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      4528
           1       0.66      0.74      0.69      1502

    accuracy                           0.84      6030
   macro avg       0.78      0.80      0.79      6030
weighted avg       0.85      0.84      0.84      6030



In [22]:
scores

Unnamed: 0,Plain RF,Random Search RF,Grid Search RF
0,0.784116,0.801024,0.799552
1,0.789244,0.79557,0.799375
2,0.772212,0.797498,0.796246
3,0.774631,0.786646,0.796589
4,0.790997,0.806585,0.806899


This model already looks better and does not overfit the data. Futhermore, we can see  around 3% of total f1-score improvement and 4% for the minority class