In [11]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
import eli5


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices and pandas
# chained-assignment warnings raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

In [12]:
def label_encode(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return a copy of ``df`` with the listed columns replaced by integer codes.

    Each column is encoded independently with scikit-learn's ``LabelEncoder``.
    The input frame is left untouched, so the function is safe to call on a
    filtered selection of another DataFrame and safe to re-run.

    Args:
        df: Frame holding the columns to encode.
        columns: Names of the columns to encode.

    Returns:
        A new DataFrame with the same columns as ``df`` in which every column
        named in ``columns`` holds the encoder's integer labels.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    # Work on a copy: assigning into the caller's frame would silently change
    # it (or write into a filtered view of another frame).
    encoded = df.copy()
    for column in columns:
        # fit_transform re-fits on every call, so a fresh encoder per column
        # yields the same codes as reusing a single instance.
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

### Data prep

In [13]:
# Read the preprocessed data once; `df_orig` stays as the raw reference frame
# and `df` is the independent working copy used by the following cells.
df_orig = pd.read_csv("../data/BankABC_preprocessed.csv")
df = df_orig.copy(deep=True)

# Preview the first five rows
df.head(5)

Unnamed: 0,institute,age,workclass,education,marital-status,occupation,relationship,hours-per-week,native-country,income,capital-diff
0,Bank A,39,Private,Bachelors,Married,high,Parent,50,North-America,1,7298
1,Bank A,42,Private,Bachelors,Married,medium,Parent,55,North-America,1,0
2,Bank A,56,unknown,Bachelors,Married-spouse-absent,unknown,Other-relative,15,North-America,0,0
3,Bank A,34,Private,Bachelors,Married,medium,Parent,40,North-America,0,0
4,Bank A,66,Self-emp-inc,Bachelors,Married,medium,Parent,40,North-America,1,15024


In [15]:
# String-valued columns that are turned into integer codes before modelling
columns = [
    'institute',
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'native-country',
]

df = label_encode(df, columns)

# Target is `income`; every remaining column is used as a feature
y = df['income']
X = df.drop(columns='income')

# 80/20 hold-out split with a fixed seed
(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, test_size=0.2, random_state=42)

# Native XGBoost containers for the xgb.train API
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

### Modeling 

In [16]:
# Define and train the baseline XGBoost booster on the full training split.
# Only booster parameters belong here: `use_label_encoder` is an option of the
# scikit-learn wrapper (XGBClassifier) and is not understood by the native
# `xgb.train` API, so it is not passed.
params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
num_rounds = 100  # number of boosting rounds

xgb_model = xgb.train(params, dtrain, num_rounds)

# With the binary:logistic objective, predict() returns positive-class probabilities
y_pred = xgb_model.predict(dtest)

# Convert probabilities to binary predictions (threshold 0.5, vectorised)
predictions = (y_pred >= 0.5).astype(int)

# Evaluate the model on the hold-out split
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: {:.4f}".format(accuracy))

Accuracy: 0.8626


In [110]:
# Baseline using the scikit-learn wrapper (XGBClassifier) with default settings
xgbc = XGBClassifier()

# fit() returns the estimator itself, not predictions, so its result is not
# bound to a separate name; `xgbc` is the fitted model.
xgbc.fit(X_train, y_train)

# 5-fold cross-validation on the training split; cross_val_score fits clones
# of `xgbc`, so the fitted instance above is left unchanged.
scores = cross_val_score(xgbc, X_train, y_train, cv=5)
print("Mean cross-validation score: %.4f" % scores.mean())

Mean cross-validation score: 0.8618


In [62]:
# Define the parameter grid for grid search
# (3^5 = 243 candidates; with cv=3 this is 729 fits plus the final refit)
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Create the XGBoost classifier used as the search template.
# It gets its own name so the booster already trained as `xgb_model` is not
# overwritten by an unfitted estimator: GridSearchCV only fits clones of the
# template, so the template itself is never fitted.
xgb_clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, scoring='accuracy', cv=3)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding mean cross-validated accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}".format(grid_search.best_score_))

# Make predictions with the best model (refit on the whole training split)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the best model on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy with Best Model: {:.4f}".format(accuracy))

Best Parameters:  {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best Accuracy: 0.86
Test Accuracy with Best Model: 0.86


In [32]:
# Show the most recently computed test accuracy with four decimals
best_model_report = "Test Accuracy with Best Model: {:.4f}".format(accuracy)
print(best_model_report)

Test Accuracy with Best Model: 0.8626


In [111]:
# Display eli5's feature-weight table for the model bound to `xgb_model`.
# NOTE(review): `xgb_model` must refer to an already-fitted model when this
# cell runs — confirm which model is meant after a fresh Restart & Run All.
eli5.explain_weights_xgboost(xgb_model)

Weight,Feature
0.3376,marital-status
0.3004,relationship
0.1316,occupation
0.1005,capital-diff
0.0623,education
0.0293,age
0.0143,hours-per-week
0.01,workclass
0.0089,institute
0.0051,native-country


In [22]:
# Build one encoded dataset per bank.
# Each subset is explicitly copied so that encoding never writes into a
# filtered selection of `df_orig` (chained assignment) and `df_orig` keeps its
# raw values no matter how often this cell is re-run.
df_a = label_encode(df_orig[df_orig['institute'] == 'Bank A'].copy(), columns)
df_b = label_encode(df_orig[df_orig['institute'] == 'Bank B'].copy(), columns)
df_c = label_encode(df_orig[df_orig['institute'] == 'Bank C'].copy(), columns)

# `institute` is constant within each subset, so it is dropped together with
# the target column.
X_a = df_a.drop(['income', 'institute'], axis=1)
y_a = df_a['income']
X_b = df_b.drop(['income', 'institute'], axis=1)
y_b = df_b['income']
X_c = df_c.drop(['income', 'institute'], axis=1)
y_c = df_c['income']

# Same 80/20 split and seed as the all-banks model
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X_a, y_a, test_size=0.2, random_state=42)
X_train_b, X_test_b, y_train_b, y_test_b = train_test_split(X_b, y_b, test_size=0.2, random_state=42)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_c, y_c, test_size=0.2, random_state=42)

# Native XGBoost containers for the xgb.train API
dtrain_a = xgb.DMatrix(X_train_a, label=y_train_a)
dtest_a = xgb.DMatrix(X_test_a, label=y_test_a)
dtrain_b = xgb.DMatrix(X_train_b, label=y_train_b)
dtest_b = xgb.DMatrix(X_test_b, label=y_test_b)
dtrain_c = xgb.DMatrix(X_train_c, label=y_train_c)
dtest_c = xgb.DMatrix(X_test_c, label=y_test_c)

In [33]:
# One booster per bank, each trained with the shared `params` / `num_rounds`.
# Steps are grouped per bank: train -> predict probabilities -> threshold at 0.5 -> score.

# Bank A
xgb_model_a = xgb.train(params, dtrain_a, num_rounds)
y_pred_a = xgb_model_a.predict(dtest_a)
predictions_a = [int(p >= 0.5) for p in y_pred_a]
accuracy_a = accuracy_score(y_test_a, predictions_a)

# Bank B
xgb_model_b = xgb.train(params, dtrain_b, num_rounds)
y_pred_b = xgb_model_b.predict(dtest_b)
predictions_b = [int(p >= 0.5) for p in y_pred_b]
accuracy_b = accuracy_score(y_test_b, predictions_b)

# Bank C
xgb_model_c = xgb.train(params, dtrain_c, num_rounds)
y_pred_c = xgb_model_c.predict(dtest_c)
predictions_c = [int(p >= 0.5) for p in y_pred_c]
accuracy_c = accuracy_score(y_test_c, predictions_c)

# Report hold-out accuracy for each bank's model
print("Accuracy model a: {:.4f}".format(accuracy_a))
print("Accuracy model b: {:.4f}".format(accuracy_b))
print("Accuracy model c: {:.4f}".format(accuracy_c))

Accuracy model a: 0.8358
Accuracy model b: 0.8631
Accuracy model c: 0.8889


In [34]:
# Display eli5's feature-weight table for the booster trained on Bank A's rows
eli5.show_weights(xgb_model_a)

Weight,Feature
0.3486,marital-status
0.3234,relationship
0.1262,occupation
0.0998,capital-diff
0.0425,education
0.0255,age
0.0152,hours-per-week
0.0115,workclass
0.0073,native-country


In [35]:
# Display eli5's feature-weight table for the booster trained on Bank B's rows
eli5.show_weights(xgb_model_b)

Weight,Feature
0.4208,marital-status
0.2601,relationship
0.1173,capital-diff
0.088,occupation
0.056,education
0.0271,age
0.0152,hours-per-week
0.0091,workclass
0.0063,native-country


In [36]:
# Display eli5's feature-weight table for the booster trained on Bank C's rows
eli5.show_weights(xgb_model_c)

Weight,Feature
0.4865,marital-status
0.1716,capital-diff
0.1302,occupation
0.1058,education
0.036,relationship
0.0302,age
0.0179,hours-per-week
0.0126,workclass
0.0093,native-country
