In [1]:
# IMPORT GENERIC PACKAGES
import numpy as np # numerical calc package
import pandas as pd # holds data
import matplotlib.pyplot as plt # plotting library
import seaborn as sns # pretty plotting
import pandas_profiling as pp
import datetime

import xgboost as xgb

In [2]:
sns.set(style='white', rc={'figure.figsize':(20,10)})

from sklearn.linear_model import LinearRegression # linear regression package
from sklearn.model_selection import train_test_split # split dataset
from sklearn.metrics import mean_squared_error as mse # Measurement metric

import sklearn.metrics as metrics
from tabulate import tabulate

from sklearn.preprocessing import StandardScaler

In [3]:
import os
cwd = os.getcwd()
print(cwd)
os.chdir('C:/Users/mox.ballo/OneDrive/FTW/Lectures/week6/exercise/FTW3-Day6')
data = pd.read_csv('census.csv')

C:\Users\mox.ballo\OneDrive\FTW\Lectures\week6\exercise\FTW3-Day6


In [4]:
# Dummy dataframes
workclass = pd.get_dummies(data['workclass'])
educ = pd.get_dummies(data['education_level'])
civstat = pd.get_dummies(data['marital-status'])
occup = pd.get_dummies(data['occupation'])
relat = pd.get_dummies(data['relationship'])
race = pd.get_dummies(data['race'])
sex = pd.get_dummies(data['sex'])
natco = pd.get_dummies(data['native-country'])


In [5]:
income = pd.get_dummies(data['income'])
data['capgain']=np.log(data['capital-gain'] +1 )
data['caploss']=np.log(data['capital-loss'] +1 )



In [6]:
dummies = list(workclass.columns.values) + list(educ.columns.values) + list(civstat.columns.values) + list(occup.columns.values) + list(relat.columns.values) + list(race.columns.values) + list(sex.columns.values) + list(natco.columns.values)

In [7]:
datareg = pd.concat([data, workclass, educ, civstat, occup, relat, race, sex, natco, income], axis=1, sort=False)

In [8]:
features = dummies + ['age','education-num', 'capgain', 'caploss', 'hours-per-week']  

In [9]:
X = datareg[features]
y = datareg['>50K']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

In [11]:
xg_reg = xgb.XGBClassifier()
xg_reg.fit(X_train, y_train)
y_predicted = xg_reg.predict(X_test)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))
           
# accuracy = metrics.accuracy_score(y_test, y_predicted)
# precision = metrics.precision_score(y_test, y_predicted)
# recall = metrics.recall_score(y_test, y_predicted)

# print(f"Accuracy = {accuracy}, precision = {precision}, and recall = {recall} ")

              precision    recall  f1-score   support

           0       0.88      0.95      0.91     13636
           1       0.79      0.60      0.68      4453

    accuracy                           0.86     18089
   macro avg       0.84      0.77      0.80     18089
weighted avg       0.86      0.86      0.86     18089



# Generate Synthetic Samples

In [12]:
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_resample(X, y)

In [13]:
Xres_train, Xres_test, yres_train, yres_test = train_test_split(X_res, y_res, test_size = 0.4, random_state = 42)

xg_reg = xgb.XGBClassifier(objective= 'binary:logistic', nthread=4,seed=42)
xg_reg.fit(Xres_train, yres_train)
yres_predicted = xg_reg.predict(Xres_test)

In [14]:
print(classification_report(yres_test, yres_predicted))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86     13595
           1       0.84      0.91      0.87     13617

    accuracy                           0.87     27212
   macro avg       0.87      0.87      0.86     27212
weighted avg       0.87      0.87      0.86     27212



In [None]:
#ROC Plot

fpr, tpr, threshold = metrics.roc_curve(yres_test, yres_predicted)
roc_auc = metrics.auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
cm = metrics.confusion_matrix(yres_test, yres_predicted)
annot_kws = {"ha": 'left',"va": 'top'} # position 
sns.heatmap(cm, annot = True, cmap='Blues', fmt='g')

# Pipeline v1

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pca = PCA()
xgb_reg_pca = xgb.XGBClassifier(objective= 'binary:logistic', nthread=4,seed=42)
pipeline = Pipeline(steps=[('pca', pca), ('xgb', xgb_reg_pca)])
pipeline.fit(Xres_train, yres_train)

yres_predicted = pipeline.predict(Xres_test)
print(classification_report(yres_test, yres_predicted))

              precision    recall  f1-score   support

           0       0.88      0.88      0.88     13595
           1       0.88      0.88      0.88     13617

    accuracy                           0.88     27212
   macro avg       0.88      0.88      0.88     27212
weighted avg       0.88      0.88      0.88     27212



# Pipeline v2

In [16]:
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
grid_search = GridSearchCV(
    estimator=xgb_reg_pca,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 3,
    verbose=True
)

In [18]:
grid_search.fit(Xres_train, yres_train)


Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  6.6min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed: 42.4min
[Parallel(n_jobs=10)]: Done 288 out of 288 | elapsed: 67.7min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=42, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=10,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
                         'max_depth': range(2, 10),
                         'n_estimators': range(60, 220, 40)},
         

In [19]:
yres_predicted = grid_search.predict(Xres_test)
print(classification_report(yres_test, yres_predicted))

              precision    recall  f1-score   support

           0       0.91      0.88      0.90     13595
           1       0.89      0.91      0.90     13617

    accuracy                           0.90     27212
   macro avg       0.90      0.90      0.90     27212
weighted avg       0.90      0.90      0.90     27212

