In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [6]:
df = pd.read_csv('./income.csv')
df.head(2)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income >50K
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0


In [7]:
df['income >50K'].value_counts() / len(df) * 100

income >50K
0    75.919044
1    24.080956
Name: count, dtype: float64

In [8]:
df.shape

(32561, 14)

In [9]:
df['workclass'] = df['workclass'].fillna(df['workclass'].mode()[0])
df['occupation'] = df['occupation'].fillna(df['occupation'].mode()[0])
df['native-country'] = df['native-country'].fillna(df['native-country'].mode()[0])

In [10]:
df.isnull().sum()

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income >50K       0
dtype: int64

In [11]:
df = pd.get_dummies(df, columns=['workclass',
 'education',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native-country'], drop_first=True, dtype=int)
df.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income >50K,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_11th,education_12th,education_1st-4th,education_5th-6th,education_7th-8th,education_9th,education_Assoc-acdm,education_Assoc-voc,education_Bachelors,education_Doctorate,education_HS-grad,education_Masters,education_Preschool,education_Prof-school,education_Some-college,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,...,native-country_Canada,native-country_China,native-country_Columbia,native-country_Cuba,native-country_Dominican-Republic,native-country_Ecuador,native-country_El-Salvador,native-country_England,native-country_France,native-country_Germany,native-country_Greece,native-country_Guatemala,native-country_Haiti,native-country_Holand-Netherlands,native-country_Honduras,native-country_Hong,native-country_Hungary,native-country_India,native-country_Iran,native-country_Ireland,native-country_Italy,native-country_Jamaica,native-country_Japan,native-country_Laos,native-country_Mexico,native-country_Nicaragua,native-country_Outlying-US(Guam-USVI-etc),native-country_Peru,native-country_Philippines,native-country_Poland,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,13,2174,0,40,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,50,13,0,0,13,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,38,9,0,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,53,7,0,0,40,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,28,13,0,0,40,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


##### Divide X, y

In [12]:
x = df.drop('income >50K', axis=1)
y = df['income >50K']

##### Train Test Splite

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=42)
x_train.shape

(24420, 96)

##### Over sampling

In [14]:
smt = SMOTE()
x_train_smt, y_train_smt = smt.fit_resample(x_train, y_train)

##### Scale

In [15]:
sc = StandardScaler()
x_train_final = sc.fit_transform(x_train_smt)
x_test_final = sc.transform(x_test)

##### Decision Tree

In [16]:
dtm = DecisionTreeClassifier(max_depth=12, criterion='entropy', random_state=42)
dtm.fit(x_train_final, y_train_smt)

y_pred_dtm = dtm.predict(x_test_final)
print('\n\n', classification_report(y_test, y_pred_dtm))
confusion_matrix(y_test, y_pred_dtm)



               precision    recall  f1-score   support

           0       0.92      0.84      0.88      6214
           1       0.61      0.78      0.68      1927

    accuracy                           0.83      8141
   macro avg       0.76      0.81      0.78      8141
weighted avg       0.85      0.83      0.83      8141



array([[5239,  975],
       [ 432, 1495]])

##### Random Forest

In [17]:
rfm = RandomForestClassifier(n_estimators=115, criterion='entropy', max_depth=30, random_state=42, bootstrap=True, class_weight='balanced')
rfm.fit(x_train_final, y_train_smt)

y_pred_rfm = rfm.predict(x_test_final)
print('\n\n', classification_report(y_test, y_pred_rfm))
confusion_matrix(y_test, y_pred_rfm)



               precision    recall  f1-score   support

           0       0.92      0.88      0.90      6214
           1       0.65      0.75      0.70      1927

    accuracy                           0.85      8141
   macro avg       0.78      0.81      0.80      8141
weighted avg       0.85      0.85      0.85      8141



array([[5441,  773],
       [ 486, 1441]])

##### XGBOOST

In [18]:
xgb = XGBClassifier()
xgb.fit(x_train_final, y_train_smt)

y_pred_xgb = xgb.predict(x_test_final)
print('\n\n', classification_report(y_test, y_pred_xgb))
confusion_matrix(y_test, y_pred_xgb)



               precision    recall  f1-score   support

           0       0.92      0.89      0.91      6214
           1       0.68      0.76      0.72      1927

    accuracy                           0.86      8141
   macro avg       0.80      0.82      0.81      8141
weighted avg       0.86      0.86      0.86      8141



array([[5526,  688],
       [ 469, 1458]])

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                           scoring='roc_auc', cv=3, verbose=2, n_jobs=-1)

grid_search.fit(x_train, y_train)

print("Best parameters found: ", grid_search.best_params_)

y_pred = grid_search.predict(x_test)
y_proba = grid_search.predict_proba(x_test)[:, 1]

print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters found:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.94      0.92      6214
           1       0.79      0.65      0.71      1927

    accuracy                           0.88      8141
   macro avg       0.84      0.80      0.82      8141
weighted avg       0.87      0.88      0.87      8141

ROC-AUC Score: 0.9302901578687427
