In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
import seaborn as sns

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split # split dataset

from sklearn.feature_selection import SelectKBest, chi2, RFE

In [2]:
df = pd.read_csv('census.csv')
df.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [3]:
features = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'income']
dummy_columns = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
dummy_df = pd.get_dummies(df[dummy_columns])
all_df = pd.concat([df[features], dummy_df], axis=1)
all_df['income'] = all_df['income'].str.replace('<=50K', '0')
all_df['income'] = all_df['income'].str.replace('>50K', '1')
all_df['income'] = pd.to_numeric(all_df['income'])

train_cols = all_df.columns.values.tolist()
train_cols.remove('income')

X = all_df[train_cols]
y = all_df['income']

# Logistic Regression Values

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
model = LogisticRegression()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {}'.format(precision_score(y_test, y_pred)))
print('Recall: {}'.format(recall_score(y_test, y_pred)))
print('ROC AUC: {}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n{}'.format(confusion_matrix(y_test, y_pred)))

Accuracy: 0.838741776770413
Precision: 0.7179341657207718
Recall: 0.568156299124186
ROC AUC: 0.7476305109583968
Confusion Matrix:
[[12642   994]
 [ 1923  2530]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# XGBoost

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)
model = XGBClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {}'.format(precision_score(y_test, y_pred)))
print('Recall: {}'.format(recall_score(y_test, y_pred)))
print('ROC AUC: {}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n{}'.format(confusion_matrix(y_test, y_pred)))

# Accuracy: 0.8634529271933219
# Precision: 0.8055469953775038
# Recall: 0.5870199865259376
# ROC AUC: 0.7703727096020712
# Confusion Matrix:
# [[13005   631]
#  [ 1839  2614]]

Accuracy: 0.8629001050362098
Precision: 0.7949177877428999
Recall: 0.5971255333483045
ROC AUC: 0.773408762567376
Confusion Matrix:
[[12950   686]
 [ 1794  2659]]


# KBest (on XGBoost)

In [6]:
X.shape

(45222, 87)

In [7]:
X_kbest = SelectKBest(chi2, k=20)
X_new = X_kbest.fit_transform(X,y)
X_new.shape
cols = X_kbest.get_support(indices=True)
for col in cols:
    print(all_df.iloc[:, col].name)

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size = 0.4, random_state = 42)
model = XGBClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("----------")
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {}'.format(precision_score(y_test, y_pred)))
print('Recall: {}'.format(recall_score(y_test, y_pred)))
print('ROC AUC: {}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n{}'.format(confusion_matrix(y_test, y_pred)))

age
education-num
capital-gain
capital-loss
hours-per-week
workclass_ Private
workclass_ Without-pay
marital-status_ Married-AF-spouse
marital-status_ Married-spouse-absent
marital-status_ Widowed
occupation_ Craft-repair
occupation_ Machine-op-inspct
occupation_ Priv-house-serv
occupation_ Transport-moving
relationship_ Husband
relationship_ Other-relative
relationship_ Own-child
relationship_ Unmarried
race_ White
sex_ Female
----------
Accuracy: 0.8628448228204987
Precision: 0.7929292929292929
Recall: 0.5993712104199416
ROC AUC: 0.7741282570140189
Confusion Matrix:
[[12939   697]
 [ 1784  2669]]


# RFE

In [8]:
# selector = RFE(model, 5, step=1)
# selector = selector.fit(X, y)

In [9]:
# rank = selector.ranking_
# keep = selector.support_
# for c, x in enumerate(train_cols):
#     if keep[c]:
#         print (rank[c],'-',x,keep[c])

# k-Fold Cross Validation

In [10]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_new, y, cv=5)
print(scores)

[0.85616363 0.85992261 0.8641088  0.86101283 0.86045997]


# SMOTE

In [11]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.4, random_state = 42)
model = XGBClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {}'.format(precision_score(y_test, y_pred)))
print('Recall: {}'.format(recall_score(y_test, y_pred)))
print('ROC AUC: {}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n{}'.format(confusion_matrix(y_test, y_pred)))

# Accuracy: 0.8628448228204987
# Precision: 0.7929292929292929
# Recall: 0.5993712104199416
# ROC AUC: 0.7741282570140189

Accuracy: 0.8629281199470822
Precision: 0.8396427344555135
Recall: 0.8974810898142028
ROC AUC: 0.8629001624135375
Confusion Matrix:
[[11261  2334]
 [ 1396 12221]]


In [12]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_new, y)
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size = 0.4, random_state = 42)
model = XGBClassifier()

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {}'.format(precision_score(y_test, y_pred)))
print('Recall: {}'.format(recall_score(y_test, y_pred)))
print('ROC AUC: {}'.format(roc_auc_score(y_test, y_pred)))
print('Confusion Matrix:\n{}'.format(confusion_matrix(y_test, y_pred)))

Accuracy: 0.862964868440394
Precision: 0.832929292929293
Recall: 0.9083498567966513
ROC AUC: 0.8629281464932136
Confusion Matrix:
[[11114  2481]
 [ 1248 12369]]
