In [163]:
# Load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [119]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=adult_columns)
data = data.replace(to_replace= ' ?', value = np.nan)

TARGET = 'Income'

In [120]:
print(data.shape)
data.head()

(32561, 15)


Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [121]:
# Cluster Education to 4 categories: 1)under graduates, 2)high school graduates, 3)some college  and 4)above 

def cluster_education(df):
    df.loc[
        lambda x: x["Education-Num"].between(0, 8, "both"), "Education"
    ] = "under-grad"

    df.loc[
        lambda x: x["Education-Num"] == 9, "Education"
    ] = "HS-grad"

    df.loc[
        lambda x: x["Education-Num"] == 10, "Education"
    ] = "Some-college"

    df.loc[
        lambda x: x["Education-Num"].between(11, 16, 'both'), "Education"
    ] = "above-grad"

cluster_education(data)

display(pd.crosstab(data["Education"], data[TARGET], margins=True))

Income,<=50K,>50K,All
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HS-grad,8826,1675,10501
Some-college,5904,1387,7291
above-grad,5981,4535,10516
under-grad,4009,244,4253
All,24720,7841,32561


In [122]:
# oe = OrdinalEncoder(categories=[[' Preschool',' 1st-4th',' 5th-6th',' 7th-8th',' 9th',' 10th',' 11th', ' 12th',' HS-grad',
#                                  ' Some-college',' Assoc-voc',' Assoc-acdm', ' Bachelors',' Masters',' Prof-school',' Doctorate']])
# data['Education'] = oe.fit_transform(data[['Education']])

# data['Education'].value_counts()


In [123]:
# drop instances with 'Without-pay' and 'Never-worked' values in Workclass

data = data[~data['Workclass'].isin([' Never-worked', ' Without-pay'])]

# data.loc[lambda x: x['Workclass'] != ' Private', 'Workclass'] = 'Other' - does not work good

display(pd.crosstab(data["Workclass"], data[TARGET], margins=True))


Income,<=50K,>50K,All
Workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Federal-gov,589,371,960
Local-gov,1476,617,2093
Private,17733,4963,22696
Self-emp-inc,494,622,1116
Self-emp-not-inc,1817,724,2541
State-gov,945,353,1298
All,23054,7650,30704


In [124]:
# Cluster countries to developed and developing

data.loc[
    lambda x: x["Country"].isin([' Holand-Netherlands', ' Scotland', ' Italy', ' England', ' Ireland', ' Germany', ' Hong',  ' France', ' Taiwan', 
                                 ' Japan', ' Puerto-Rico', ' Canada', ' United-States']), "Country"
] = "Developed"

data.loc[
    lambda x: x["Country"].isin([' Hungary', ' Greece', ' Portugal', ' Poland', ' Yugoslavia', ' Cambodia', ' Iran',  ' Philippines', ' Laos', ' Thailand', ' Vietnam', ' South', 
                                 ' China', ' India', ' Honduras', ' Outlying-US(Guam-USVI-etc)', ' Trinadad&Tobago', ' Ecuador',  ' Philippines', ' Nicaragua',
                                 ' Peru', ' Haiti', ' Columbia', ' Guatemala', ' Dominican-Republic', ' Jamaica',  ' Cuba', ' El-Salvador', ' Mexico']), "Country"
] = "Developing"

display(pd.crosstab(data["Country"], data[TARGET], margins=True))


Income,<=50K,>50K,All
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Developed,22493,7391,29884
Developing,1769,304,2073
All,24262,7695,31957


In [125]:
# cluster Marrital Status to Married and Single

data.loc[
    lambda x: x["Marital Status"].isin([' Married-AF-spouse', ' Married-civ-spouse']), "Marital Status"
] = "Married"
data.loc[
    lambda x: x["Marital Status"].isin([' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced']), "Marital Status"
] = "Single"

display(pd.crosstab(data["Marital Status"], data[TARGET], margins=True))
data.head(1)

Income,<=50K,>50K,All
Marital Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Married,8288,6702,14990
Single,16411,1139,17550
All,24699,7841,32540


Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,above-grad,13,Single,Adm-clerical,Not-in-family,White,Male,2174,0,40,Developed,<=50K


In [126]:
# Cluster those, who have gained or lost some capital together, those who have 0 values in both columns will be grouped to the second cluster
# It actually did not work, so let's left it commented

# data.loc[lambda x: x['Capital Gain'] != 0, 'Capital Gain'] = 1
# data.loc[lambda x: x['Capital Loss'] != 0, 'Capital Loss'] = 1

# data.head(10)

In [127]:
data = data.drop(['Education-Num'], axis = 1)
data = data.dropna(how='any')
data.shape

(30148, 14)

In [128]:
data_quantitative = data.select_dtypes(include=['number'])
cols = data_quantitative.columns

scaler = MinMaxScaler()

for col in cols:
    data_quantitative[col] = scaler.fit_transform(data_quantitative[[col]])

data_quantitative

Unnamed: 0,Age,final weight,Capital Gain,Capital Loss,Hours per week
0,0.301370,0.043338,0.021740,0.0,0.397959
1,0.452055,0.047277,0.000000,0.0,0.122449
2,0.287671,0.137244,0.000000,0.0,0.397959
3,0.493151,0.150212,0.000000,0.0,0.397959
4,0.150685,0.220703,0.000000,0.0,0.397959
...,...,...,...,...,...
32556,0.136986,0.165563,0.000000,0.0,0.377551
32557,0.315068,0.095589,0.000000,0.0,0.397959
32558,0.561644,0.093914,0.000000,0.0,0.397959
32559,0.068493,0.127620,0.000000,0.0,0.193878


In [129]:
list_nominal = ["Workclass", "Marital Status", "Ethnic group", "Sex", "Country"]
data_nominal = pd.get_dummies(data[list_nominal], drop_first=True, dtype=int)
data_nominal

Unnamed: 0,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Marital Status_Single,Ethnic group_ Asian-Pac-Islander,Ethnic group_ Black,Ethnic group_ Other,Ethnic group_ White,Sex_ Male,Country_Developing
0,0,0,0,0,1,1,0,0,0,1,1,0
1,0,0,0,1,0,0,0,0,0,1,1,0
2,0,1,0,0,0,1,0,0,0,1,1,0
3,0,1,0,0,0,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,1,0,0,0,0,0,0,0,1,0,0
32557,0,1,0,0,0,0,0,0,0,1,1,0
32558,0,1,0,0,0,1,0,0,0,1,0,0
32559,0,1,0,0,0,1,0,0,0,1,1,0


In [130]:
# apply ordinal encording to Education-cluster using pandas

scale_mapper = {'under-grad':0, 'Some-college':0.3, 'HS-grad':0.7, 'above-grad':1}
data_ordinal = data["Education"].replace(scale_mapper)
data_ordinal.head()

0    1.0
1    1.0
2    0.7
3    0.0
4    1.0
Name: Education, dtype: float64

In [131]:
data[TARGET] = data[TARGET].replace( {' <=50K':0, ' >50K':1})
data[TARGET]

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: Income, Length: 30148, dtype: int64

In [132]:
# data = pd.concat([data[TARGET], data_quantitative, data_nominal, data_ordinal], axis=1)
data = pd.concat([data[TARGET], data_quantitative, data_ordinal, data_nominal], axis=1)

data.tail(2)

Unnamed: 0,Income,Age,final weight,Capital Gain,Capital Loss,Hours per week,Education,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Marital Status_Single,Ethnic group_ Asian-Pac-Islander,Ethnic group_ Black,Ethnic group_ Other,Ethnic group_ White,Sex_ Male,Country_Developing
32559,0,0.068493,0.12762,0.0,0.0,0.193878,0.7,0,1,0,0,0,1,0,0,0,1,1,0
32560,1,0.479452,0.186383,0.150242,0.0,0.397959,0.7,0,0,1,0,0,0,0,0,0,1,0,0


In [180]:
X_train, X_cv, y_train, y_cv = train_test_split(
    data.drop(columns=[TARGET]), 
    data[TARGET], 
    test_size=0.20, 
    stratify=data[TARGET]
)

In [190]:
# track f1-scores of models:
scores = pd.DataFrame()

# apply random forest classifier

# initialize model
rf_plain = RandomForestClassifier()

def rand_forest_evaluate(model):
    
    model.fit(X_train, y_train)

    # count f1-score on training
    score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')

    # predict on train set
    y_train_pred = model.predict(X_train)

    # predict on test set
    y_cv_pred = model.predict(X_cv)

    # f1-score
    print(f"o F1-score (cross-val) on train set: {np.mean(score)}")

    # classification report
    print("o Classification report on train set:")
    print(classification_report(y_train, y_train_pred))
    print("o Classification report on test set:")

    print(classification_report(y_cv, y_cv_pred))
    
    return score

scores['Plain RF'] = rand_forest_evaluate(rf_plain)

o F1-score (cross-val) on train set: 0.7689049480365044
o Classification report on train set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18112
           1       1.00      1.00      1.00      6006

    accuracy                           1.00     24118
   macro avg       1.00      1.00      1.00     24118
weighted avg       1.00      1.00      1.00     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90      4528
           1       0.70      0.62      0.66      1502

    accuracy                           0.84      6030
   macro avg       0.79      0.77      0.78      6030
weighted avg       0.84      0.84      0.84      6030



This model highly overfits the data. Let's try to tweak RandomForest's hyperparameters to avoid overfitting:

In [141]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Different class weights
class_weight = [{0:1, 1:1}, {0:1, 1:1.5}, {0:1, 1:2}, {0:1, 1:3}]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap, 
               'class_weight': class_weight}

from pprint import pprint
pprint(random_grid)

{'bootstrap': [True, False],
 'class_weight': [{0: 1, 1: 1},
                  {0: 1, 1: 1.5},
                  {0: 1, 1: 2},
                  {0: 1, 1: 3},
                  {0: 1, 1: 5}],
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [144]:
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=random_grid, scoring='f1', n_iter=100, n_jobs = -1, return_train_score=True)

rf_random.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [145]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'class_weight': {0: 1, 1: 2},
 'bootstrap': True}

Now lets train the model again with searche hyperparameters:

In [192]:
rf_rand_search = RandomForestClassifier(n_estimators = 200,
                                        min_samples_split = 5,
                                        min_samples_leaf = 1,
                                        max_depth = 10,
                                        class_weight = {0: 1, 1: 2},
                                        bootstrap = True)

scores['Random Search RF'] = rand_forest_evaluate(rf_rand_search)

o F1-score (cross-val) on train set: 0.7946881394711769
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.93      0.86      0.89     18112
           1       0.66      0.81      0.73      6006

    accuracy                           0.85     24118
   macro avg       0.80      0.83      0.81     24118
weighted avg       0.86      0.85      0.85     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89      4528
           1       0.64      0.80      0.71      1502

    accuracy                           0.84      6030
   macro avg       0.78      0.82      0.80      6030
weighted avg       0.85      0.84      0.84      6030



The model still overfits a little bit. As the first hyperparameres search was quite rough, let's now try to make a more specific one around smaller amount of passible values. This time using the GridSearchCV:


In [171]:
n_estimators = [100, 200]
# Maximum number of levels in tree
max_depth = [7,8,9]
# Minimum number of samples required to split a node
min_samples_split = [4, 5, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]
# Method of selecting samples for training each tree
boorstrap = [True]
# Different class weights
class_weight = [{0:1, 1:1.5}, {0:1, 1:1.6}, {0:1, 1:1.7}, {0:1, 1:1.8}, {0:1, 1:1.9}, {0:1, 1:2}]

# Create the random grid
grid_params = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': boorstrap,
               'class_weight': class_weight}

grid_params

{'n_estimators': [100, 200],
 'max_depth': [7, 8, 9],
 'min_samples_split': [4, 5, 6],
 'min_samples_leaf': [1, 2],
 'bootstrap': [True],
 'class_weight': [{0: 1, 1: 1.5},
  {0: 1, 1: 1.6},
  {0: 1, 1: 1.7},
  {0: 1, 1: 1.8},
  {0: 1, 1: 1.9},
  {0: 1, 1: 2}]}

In [172]:
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=grid_params, scoring='f1', n_jobs = -1, cv=4)
rf_grid_search.fit(X_train, y_train)

In [173]:
rf_grid_search.best_params_

{'bootstrap': True,
 'class_weight': {0: 1, 1: 1.9},
 'max_depth': 9,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'n_estimators': 200}

In [193]:
rf_grid_search = RandomForestClassifier(n_estimators = 200,
                            min_samples_split = 6,
                            min_samples_leaf = 2,
                            max_depth = 9,
                            class_weight = {0: 1, 1: 1.9},
                            bootstrap = True)

scores['Grid Search RF'] = rand_forest_evaluate(rf_grid_search)


o F1-score (cross-val) on train set: 0.7965107987332758
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     18112
           1       0.68      0.75      0.71      6006

    accuracy                           0.85     24118
   macro avg       0.80      0.82      0.81     24118
weighted avg       0.86      0.85      0.85     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90      4528
           1       0.67      0.74      0.71      1502

    accuracy                           0.85      6030
   macro avg       0.79      0.81      0.80      6030
weighted avg       0.85      0.85      0.85      6030



In [194]:
scores

Unnamed: 0,Plain RF,Random Search RF,Grid Search RF
0,0.76382,0.797239,0.797314
1,0.772587,0.795614,0.793751
2,0.775433,0.797395,0.799335
3,0.762998,0.793834,0.796247
4,0.769687,0.789358,0.795908


This model already looks better and does not overfit the data. Futhermore, we can see  around 3% of total f1-score improvement and 4% for the minority class