In [1]:
# Load libraries

import numpy as np
import pandas as pd
from common_functions import preprocess_data, cluster_categorical
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=adult_columns)
df = df.replace(to_replace= ' ?', value = np.nan)

TARGET = 'Income'

In [3]:
data = df.copy()

In [4]:
print(data.shape)
data.head()

(32561, 15)


Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Cluster Education to 4 categories: 1)under graduates, 2)high school graduates, 3)some college  and 4)above 

def cluster_education(df):
    df.loc[
        lambda x: x["Education-Num"].between(0, 8, "both"), "Education"
    ] = "under-grad"

    df.loc[
        lambda x: x["Education-Num"] == 9, "Education"
    ] = "HS-grad"

    df.loc[
        lambda x: x["Education-Num"] == 10, "Education"
    ] = "Some-college"

    df.loc[
        lambda x: x["Education-Num"].between(11, 16, 'both'), "Education"
    ] = "above-grad"

cluster_education(data)
display(pd.crosstab(data["Education"], data[TARGET], margins=True))

Income,<=50K,>50K,All
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HS-grad,8826,1675,10501
Some-college,5904,1387,7291
above-grad,5981,4535,10516
under-grad,4009,244,4253
All,24720,7841,32561


In [6]:
# apply ordinal encording to Education-cluster

scale_mapper = {'under-grad':0, 'Some-college':1, 'HS-grad':2, 'above-grad':3}
# data_ordinal = data["Education"].replace(scale_mapper)
# data_ordinal.head()
data["Education"] = data["Education"].replace(scale_mapper)
data.sample()

Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
13136,41,Local-gov,173051,1,10,Divorced,Other-service,Unmarried,White,Female,0,0,45,United-States,<=50K


In [7]:
data = data.drop(['Education-Num'], axis = 1)
data = data.dropna(how='any')
data

Unnamed: 0,Age,Workclass,final weight,Education,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,3,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,3,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,2,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,3,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,3,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,2,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,2,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,2,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [8]:
data = cluster_categorical(data)

# drop instances with 'Without-pay' and 'Never-worked' values in Workclass

data = data[~data['Workclass'].isin([' Never-worked', ' Without-pay'])]

display(pd.crosstab(data["Workclass"], data[TARGET], margins=True))

Income,<=50K,>50K,All
Workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Federal-gov,578,365,943
Local-gov,1458,609,2067
Private,17410,4876,22286
Self-emp-inc,474,600,1074
Self-emp-not-inc,1785,714,2499
State-gov,935,344,1279
All,22640,7508,30148


In [9]:
numerical_features_list = ['Age', 'final weight', 'Capital Gain', 'Capital Loss', 'Hours per week']
categorical_features_list = ['Workclass', 'Marital Status', 'Occupation', 'Relationship', 'Ethnic group', 'Sex', 'Country']

In [10]:
data_preprocessed = preprocess_data(data, numerical_features_list,  categorical_features_list, education=False)
data_preprocessed.sample()

Unnamed: 0,Income,stand scaler__Age,stand scaler__final weight,stand scaler__Capital Gain,stand scaler__Capital Loss,stand scaler__Hours per week,onehot__Workclass_ Local-gov,onehot__Workclass_ Private,onehot__Workclass_ Self-emp-inc,onehot__Workclass_ Self-emp-not-inc,...,onehot__Occupation_ Tech-support,onehot__Occupation_ Transport-moving,onehot__Relationship_Not-in-Family,onehot__Ethnic group_ Asian-Pac-Islander,onehot__Ethnic group_ Black,onehot__Ethnic group_ Other,onehot__Ethnic group_ White,onehot__Sex_ Male,onehot__Country_Developing,Education
285,0,0.57633,-1.01002,-0.147449,-0.218639,-0.662589,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3


In [11]:
X_train, X_cv, y_train, y_cv = train_test_split(
    data_preprocessed.drop(columns = [TARGET]), 
    data_preprocessed[TARGET], 
    test_size=0.2, 
    stratify=data_preprocessed.iloc[:, 0]
)

In [12]:
X_train

Unnamed: 0,stand scaler__Age,stand scaler__final weight,stand scaler__Capital Gain,stand scaler__Capital Loss,stand scaler__Hours per week,onehot__Workclass_ Local-gov,onehot__Workclass_ Private,onehot__Workclass_ Self-emp-inc,onehot__Workclass_ Self-emp-not-inc,onehot__Workclass_ State-gov,...,onehot__Occupation_ Tech-support,onehot__Occupation_ Transport-moving,onehot__Relationship_Not-in-Family,onehot__Ethnic group_ Asian-Pac-Islander,onehot__Ethnic group_ Black,onehot__Ethnic group_ Other,onehot__Ethnic group_ White,onehot__Sex_ Male,onehot__Country_Developing,Education
4949,-0.490039,1.430423,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3
3343,-1.023224,1.361684,-0.147449,-0.218639,0.339429,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3
4897,-0.566209,0.123234,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3
29864,0.804838,-0.045050,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2
29681,0.957176,0.673261,-0.147449,-0.218639,0.756936,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19988,-0.642378,0.073414,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3
4625,-0.947055,1.218215,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,3
20958,-0.718547,-0.252072,-0.147449,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2
10005,0.881007,-0.298192,0.271429,-0.218639,-0.078078,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,3


In [13]:
# track f1-scores of models:
scores = pd.DataFrame()

# apply random forest classifier

# initialize model
rf_plain = RandomForestClassifier()

def rand_forest_evaluate(model):
    
    model.fit(X_train, y_train)

    # count f1-score on training
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')

    # predict on train set
    y_train_pred = model.predict(X_train)

    # predict on test set
    y_cv_pred = model.predict(X_cv)

    # f1-score
    print(f"o F1-score (cross-val) on train set: {np.mean(score)}")

    # classification report
    print("o Classification report on train set:")
    print(classification_report(y_train, y_train_pred))
    print("o Classification report on test set:")

    print(classification_report(y_cv, y_cv_pred))
    
    return score

scores['Plain RF'] = rand_forest_evaluate(rf_plain)

o F1-score (cross-val) on train set: 0.7789488932755584
o Classification report on train set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16781
           1       1.00      1.00      1.00      5563

    accuracy                           1.00     22344
   macro avg       1.00      1.00      1.00     22344
weighted avg       1.00      1.00      1.00     22344

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      4196
           1       0.72      0.58      0.64      1391

    accuracy                           0.84      5587
   macro avg       0.79      0.75      0.77      5587
weighted avg       0.83      0.84      0.83      5587



This model highly overfits the data. Let's try to tweak RandomForest's hyperparameters to avoid overfitting:

In [14]:
# Number of trees in the Random Forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Different class weights
class_weight = [{0:1, 1:1}, {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:3}, 'balanced']

# Create a random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap, 
               'class_weight': class_weight}

from pprint import pprint
pprint(random_grid)

{'bootstrap': [True, False],
 'class_weight': [{0: 1, 1: 1},
                  {0: 1, 1: 1.5},
                  {0: 1, 1: 2},
                  {0: 1, 1: 3},
                  'balanced'],
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}


In [15]:
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=random_grid, scoring='f1', n_iter=100, n_jobs = -1, return_train_score=True)

rf_random.fit(X_train, y_train)

In [16]:
rf_random.best_params_

{'n_estimators': 300,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_depth': 10,
 'class_weight': {0: 1, 1: 2},
 'bootstrap': False}

Now lets train the model again with best hyperparameters:

In [17]:
rf_random_search = RandomForestClassifier(n_estimators = 600,
                                        min_samples_split = 5,
                                        min_samples_leaf = 2,
                                        max_depth = 10,
                                        class_weight = {0: 1, 1: 2},
                                        bootstrap = False)

scores['Random Search RF'] = rand_forest_evaluate(rf_random_search)

o F1-score (cross-val) on train set: 0.7879564267802246
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89     16781
           1       0.65      0.80      0.72      5563

    accuracy                           0.84     22344
   macro avg       0.79      0.83      0.80     22344
weighted avg       0.86      0.84      0.85     22344

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.92      0.85      0.88      4196
           1       0.63      0.78      0.70      1391

    accuracy                           0.83      5587
   macro avg       0.77      0.81      0.79      5587
weighted avg       0.85      0.83      0.84      5587



The model still overfits a little bit. As the first hyperparameres search was quite rough, let's now try to make a more specific one around smaller amount of possible values. This time using the GridSearchCV:


In [18]:
# Number of trees in the Random Forest
n_estimators = [550, 600, 650]

# Minimum number of samples required to split a node
min_samples_split = [4, 5, 6]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 3]

# Maximum number of levels in tree
max_depth = [9,10,11]

# Different class weights
class_weight = [{0:1, 1:1.4}, {0:1, 1:1.5}, {0:1, 1:1.6}, {0:1, 1:1.7}, {0:1, 1:1.8}]

# Method of selecting samples for training each tree
boorstrap = [True, False]

# Create a random grid
grid_params = {'bootstrap': boorstrap,
               'class_weight': class_weight,
               'max_depth': max_depth,
               'min_samples_leaf': min_samples_leaf,
               'min_samples_split': min_samples_split,
               'n_estimators': n_estimators
               }

grid_params

{'bootstrap': [True, False],
 'class_weight': [{0: 1, 1: 1.4},
  {0: 1, 1: 1.5},
  {0: 1, 1: 1.6},
  {0: 1, 1: 1.7},
  {0: 1, 1: 1.8}],
 'max_depth': [9, 10, 11],
 'min_samples_leaf': [2, 3],
 'min_samples_split': [4, 5, 6],
 'n_estimators': [550, 600, 650]}

In [19]:
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid_params, scoring='f1', n_jobs = -1, cv=4)
rf_grid_search.fit(X_train, y_train)

In [20]:
rf_grid_search.best_params_

{'bootstrap': False,
 'class_weight': {0: 1, 1: 1.8},
 'max_depth': 11,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 600}

In [21]:
rf_grid_search = RandomForestClassifier(bootstrap = False,
                            class_weight = {0: 1, 1: 1.8},
                            max_depth = 8,
                            min_samples_leaf = 2,
                            min_samples_split = 6,
                            n_estimators = 550)
                            

scores['Grid Search RF'] = rand_forest_evaluate(rf_grid_search)


o F1-score (cross-val) on train set: 0.7900788263755175
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89     16781
           1       0.67      0.73      0.70      5563

    accuracy                           0.84     22344
   macro avg       0.79      0.81      0.80     22344
weighted avg       0.85      0.84      0.84     22344

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4196
           1       0.66      0.72      0.69      1391

    accuracy                           0.84      5587
   macro avg       0.78      0.80      0.79      5587
weighted avg       0.84      0.84      0.84      5587



In [22]:
scores

Unnamed: 0,Plain RF,Random Search RF,Grid Search RF
0,0.76656,0.779645,0.782295
1,0.772214,0.784785,0.782688
2,0.797069,0.795619,0.796993
3,0.783175,0.789068,0.793925
4,0.775727,0.790665,0.794492


This model already looks better and does not overfit the data. Futhermore, we can see  around 3% of total f1-score improvement and 4% for the minority class