In [117]:
# Load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

In [118]:
adult_columns = [
    "Age",
    "Workclass",
    "final weight",
    "Education",
    "Education-Num",
    "Marital Status",
    "Occupation",
    "Relationship",
    "Ethnic group",
    "Sex",
    "Capital Gain",
    "Capital Loss",
    "Hours per week",
    "Country",
    "Income",
]

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=adult_columns)
data = data.replace(to_replace= ' ?', value = np.nan)

TARGET = 'Income'

In [119]:
print(data.shape)
data.head()

(32561, 15)


Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [120]:
# Cluster Education to 4 categories: 1)under graduates, 2)high school graduates, 3)some college  and 4)above 

def cluster_education(df):
    df.loc[
        lambda x: x["Education-Num"].between(0, 8, "both"), "Education"
    ] = "under-grad"

    df.loc[
        lambda x: x["Education-Num"] == 9, "Education"
    ] = "HS-grad"

    df.loc[
        lambda x: x["Education-Num"] == 10, "Education"
    ] = "Some-college"

    df.loc[
        lambda x: x["Education-Num"].between(11, 16, 'both'), "Education"
    ] = "above-grad"

cluster_education(data)

display(pd.crosstab(data["Education"], data[TARGET], margins=True))

Income,<=50K,>50K,All
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HS-grad,8826,1675,10501
Some-college,5904,1387,7291
above-grad,5981,4535,10516
under-grad,4009,244,4253
All,24720,7841,32561


In [121]:
# drop instances with 'Without-pay' and 'Never-worked' values in Workclass

data = data[~data['Workclass'].isin([' Never-worked', ' Without-pay'])]

# data.loc[lambda x: x['Workclass'] != ' Private', 'Workclass'] = 'Other' - does not work good

display(pd.crosstab(data["Workclass"], data[TARGET], margins=True))


Income,<=50K,>50K,All
Workclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Federal-gov,589,371,960
Local-gov,1476,617,2093
Private,17733,4963,22696
Self-emp-inc,494,622,1116
Self-emp-not-inc,1817,724,2541
State-gov,945,353,1298
All,23054,7650,30704


In [122]:
# Cluster countries to developed and developing

data.loc[
    lambda x: x["Country"].isin([' Holand-Netherlands', ' Scotland', ' Italy', ' England', ' Ireland', ' Germany', ' Hong',  ' France', ' Taiwan', 
                                 ' Japan', ' Puerto-Rico', ' Canada', ' United-States']), "Country"
] = "Developed"

data.loc[
    lambda x: x["Country"].isin([' Hungary', ' Greece', ' Portugal', ' Poland', ' Yugoslavia', ' Cambodia', ' Iran',  ' Philippines', ' Laos', ' Thailand', ' Vietnam', ' South', 
                                 ' China', ' India', ' Honduras', ' Outlying-US(Guam-USVI-etc)', ' Trinadad&Tobago', ' Ecuador',  ' Philippines', ' Nicaragua',
                                 ' Peru', ' Haiti', ' Columbia', ' Guatemala', ' Dominican-Republic', ' Jamaica',  ' Cuba', ' El-Salvador', ' Mexico']), "Country"
] = "Developing"

display(pd.crosstab(data["Country"], data[TARGET], margins=True))


Income,<=50K,>50K,All
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Developed,22493,7391,29884
Developing,1769,304,2073
All,24262,7695,31957


In [123]:
# cluster Marrital Status to Married and Single

data.loc[
    lambda x: x["Marital Status"].isin([' Married-AF-spouse', ' Married-civ-spouse']), "Marital Status"
] = "Married"
data.loc[
    lambda x: x["Marital Status"].isin([' Widowed', ' Separated', ' Married-spouse-absent', ' Never-married', ' Divorced']), "Marital Status"
] = "Single"

display(pd.crosstab(data["Marital Status"], data[TARGET], margins=True))
data.head(1)

Income,<=50K,>50K,All
Marital Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Married,8288,6702,14990
Single,16411,1139,17550
All,24699,7841,32540


Unnamed: 0,Age,Workclass,final weight,Education,Education-Num,Marital Status,Occupation,Relationship,Ethnic group,Sex,Capital Gain,Capital Loss,Hours per week,Country,Income
0,39,State-gov,77516,above-grad,13,Single,Adm-clerical,Not-in-family,White,Male,2174,0,40,Developed,<=50K


In [124]:
# Cluster those, who have gained or lost some capital together, those who have 0 values in both columns will be grouped to the second cluster
# It actually did not work, so let's left it commented

# data.loc[lambda x: x['Capital Gain'] != 0, 'Capital Gain'] = 1
# data.loc[lambda x: x['Capital Loss'] != 0, 'Capital Loss'] = 1

# data.head(10)

In [125]:
data = data.drop(['Education-Num'], axis = 1)
data = data.dropna(how='any')
data.shape

(30148, 14)

In [126]:
data_quantitative = data.select_dtypes(include=['number'])
cols = data_quantitative.columns

scaler = MinMaxScaler()

for col in cols:
    data_quantitative[col] = scaler.fit_transform(data_quantitative[[col]])

data_quantitative

Unnamed: 0,Age,final weight,Capital Gain,Capital Loss,Hours per week
0,0.301370,0.043338,0.021740,0.0,0.397959
1,0.452055,0.047277,0.000000,0.0,0.122449
2,0.287671,0.137244,0.000000,0.0,0.397959
3,0.493151,0.150212,0.000000,0.0,0.397959
4,0.150685,0.220703,0.000000,0.0,0.397959
...,...,...,...,...,...
32556,0.136986,0.165563,0.000000,0.0,0.377551
32557,0.315068,0.095589,0.000000,0.0,0.397959
32558,0.561644,0.093914,0.000000,0.0,0.397959
32559,0.068493,0.127620,0.000000,0.0,0.193878


In [127]:
list_nominal = ["Workclass", "Marital Status", "Ethnic group", "Sex", "Country"]
data_nominal = pd.get_dummies(data[list_nominal], drop_first=True)
data_nominal

Unnamed: 0,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Marital Status_Single,Ethnic group_ Asian-Pac-Islander,Ethnic group_ Black,Ethnic group_ Other,Ethnic group_ White,Sex_ Male,Country_Developing
0,0,0,0,0,1,1,0,0,0,1,1,0
1,0,0,0,1,0,0,0,0,0,1,1,0
2,0,1,0,0,0,1,0,0,0,1,1,0
3,0,1,0,0,0,0,0,1,0,0,1,0
4,0,1,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,1,0,0,0,0,0,0,0,1,0,0
32557,0,1,0,0,0,0,0,0,0,1,1,0
32558,0,1,0,0,0,1,0,0,0,1,0,0
32559,0,1,0,0,0,1,0,0,0,1,1,0


In [128]:
# apply ordinal encording to Education-cluster using pandas
scale_mapper = {'under-grad':0, 'Some-college':0.333, 'HS-grad':0.666, 'above-grad':1}
data_ordinal = data["Education"].replace(scale_mapper)
data_ordinal.head()

0    1.000
1    1.000
2    0.666
3    0.000
4    1.000
Name: Education, dtype: float64

In [129]:
data[TARGET] = data[TARGET].replace( {' <=50K':0, ' >50K':1})
data[TARGET]

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: Income, Length: 30148, dtype: int64

In [130]:
data = pd.concat([data[TARGET], data_quantitative, data_nominal, data_ordinal], axis=1)
data.tail(2)

Unnamed: 0,Income,Age,final weight,Capital Gain,Capital Loss,Hours per week,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Marital Status_Single,Ethnic group_ Asian-Pac-Islander,Ethnic group_ Black,Ethnic group_ Other,Ethnic group_ White,Sex_ Male,Country_Developing,Education
32559,0,0.068493,0.12762,0.0,0.0,0.193878,0,1,0,0,0,1,0,0,0,1,1,0,0.666
32560,1,0.479452,0.186383,0.150242,0.0,0.397959,0,0,1,0,0,0,0,0,0,1,0,0,0.666


In [135]:
X_train, X_cv, y_train, y_cv = train_test_split(
    data.drop(columns=[TARGET]), 
    data[TARGET], 
    test_size=0.20, 
    stratify=data[TARGET]
)

In [136]:
# apply random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# initialize model
rf = RandomForestClassifier(max_depth=8, class_weight = {0:1, 1:1.5})

# fit model
rf.fit(X_train, y_train)

# count f1-score on training
score = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=5, scoring='f1_macro')

# predict on train set
y_train_pred = rf.predict(X_train)

# predict on test set
y_cv_pred = rf.predict(X_cv)

# evaluate model

# accuracy
print(f"o Accuracy on train set: {accuracy_score(y_train, y_train_pred):.2f}")
print(f"o Accuracy on test set: {accuracy_score(y_cv, y_cv_pred):.2f}")

# f1-score
print(f"o F1-score (cross-val) on train set: {np.mean(score)}")

# confusion matrix
print("o Confusion matrix on train set:")
print(confusion_matrix(y_train, y_train_pred))
print("o Confusion matrix on test set:")

print(confusion_matrix(y_cv, y_cv_pred))

# classification report
print("o Classification report on train set:")
print(classification_report(y_train, y_train_pred))
print("o Classification report on test set:")

print(classification_report(y_cv, y_cv_pred))


o Accuracy on train set: 0.85
o Accuracy on test set: 0.85
o F1-score (cross-val) on train set: 0.7715685261938703
o Confusion matrix on train set:
[[16780  1332]
 [ 2209  3797]]
o Confusion matrix on test set:
[[4193  335]
 [ 552  950]]
o Classification report on train set:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     18112
           1       0.74      0.63      0.68      6006

    accuracy                           0.85     24118
   macro avg       0.81      0.78      0.79     24118
weighted avg       0.85      0.85      0.85     24118

o Classification report on test set:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90      4528
           1       0.74      0.63      0.68      1502

    accuracy                           0.85      6030
   macro avg       0.81      0.78      0.79      6030
weighted avg       0.85      0.85      0.85      6030

