In [None]:
from datetime import datetime

import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_curve,roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_val_score,cross_val_predict,train_test_split,cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Call without an argument (or with ``None``) to obtain the current
    timestamp; pass that timestamp back in to print and return the elapsed
    wall-clock time.

    Parameters
    ----------
    start_time : datetime.datetime, optional
        Timestamp returned by an earlier call. When omitted, the stopwatch
        is started instead.

    Returns
    -------
    datetime.datetime or str
        The current timestamp when starting; otherwise a summary such as
        ``"0 hours 13 minutes and 22.3 seconds."``.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    # divmod on floats yields floats; cast the whole parts so the returned text
    # reads "0 hours 13 minutes" (like the printed line) instead of "0.0 hours 13.0 minutes".
    hours, minutes, seconds = int(hours), int(minutes), round(seconds, 2)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, seconds))
    return f"{hours} hours {minutes} minutes and {seconds} seconds."

# **Read Dataset**

In [None]:
# Load the adult-all dataset directly from GitHub. Column names are supplied
# explicitly and every '?' entry is read as a missing value.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'

column_names = [
    "Age",
    "Workclass",
    "Final Weight",
    "Education",
    "Education Number of Years",
    "Marital-status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-gain",
    "Capital-loss",
    "Hours-per-week",
    "Native-country",
    "Target",
]
df_raw = pd.read_csv(url, names=column_names, na_values='?')

In [None]:
# Work on a copy so the raw frame stays untouched and this cell can be re-run.
df = df_raw.copy()

# Columns that contain at least one missing value ...
na_columns = df.columns[df.isna().any()].tolist()
# ... are imputed with their most frequent value (the mode).
df[na_columns] = df[na_columns].apply(lambda col: col.fillna(col.mode()[0]))

# Drop Final Weight, and Education as it correlates with Education Number of Years.
df = df.drop(columns=['Final Weight', 'Education'])

# Label-encode the target.
mapping = {"<=50K": 0,
           '>50K': 1}
target_encoded = df["Target"].map(mapping)
# .map() yields NaN for any label missing from `mapping`; fail here with a clear
# message rather than deep inside a model fit.
if target_encoded.isna().any():
    unexpected_labels = df.loc[target_encoded.isna(), "Target"].unique().tolist()
    raise ValueError(f"Unexpected Target labels: {unexpected_labels}")
# Assign a real integer column: an object-dtype target is not accepted as class
# labels by scikit-learn.
df["Target"] = target_encoded.astype("int64")

# Feature list = every column except the target. The comparison is an exact
# match; `f not in ("Target")` would be a substring test against the string.
features = [f for f in df.columns if f != "Target"]

# Split the features into categorical and numerical column indexes.
cat_ix = df[features].select_dtypes(include=['object', 'bool']).columns
num_ix = df[features].select_dtypes(include=['int64', 'float64']).columns

# **Pre-processing**

In [None]:
# Simplify the number of categories.
# Workclass is collapsed into 4 groups. 'Never-worked' is folded into
# "Without-pay" so that those rows are not turned into NaN by .map().
mapping_workclass = {'State-gov': "Public",
                     'Self-emp-not-inc': "Self",
                     'Private': "Private",
                     'Federal-gov': "Public",
                     'Local-gov': "Public",
                     'Self-emp-inc': "Self",
                     'Without-pay': "Without-pay",
                     'Never-worked': "Without-pay"}
# Marital status is collapsed into 4 groups (instead of 7).
mapping_marital = {'Married-civ-spouse': "Married",
                   'Married-spouse-absent': "Divorced",
                   'Married-AF-spouse': "Married",
                   'Never-married': "Single",
                   'Separated': "Divorced",
                   'Divorced': 'Divorced',
                   'Widowed': "Widowed"}
# .map() returns NaN for labels that are not dictionary keys. Falling back to the
# current value keeps unexpected labels intact and makes re-running this cell
# harmless (already-simplified labels stay as they are).
df["Workclass"] = df["Workclass"].map(mapping_workclass).fillna(df["Workclass"])
df["Marital-status"] = df["Marital-status"].map(mapping_marital).fillna(df["Marital-status"])

# Simplify Native country: keep 'United-States', relabel everything else.
# A boolean mask avoids building a list with one entry per non-US row.
df['Native-country'] = df['Native-country'].where(df['Native-country'] == 'United-States', "Not USA")
# Simplify Race: keep 'White', relabel everything else.
df['Race'] = df['Race'].where(df['Race'] == 'White', "Not Whites")

# Split the dataframe into X and y.
X = df.drop(columns='Target')
y = df['Target']
# Perform the train-test split (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **HP-grid**

In [None]:
# DEFINING HP GRIDS
# Each grid_* dict is passed as `param_grid` to GridSearchCV in the model cells below.

# parameter grid for XGB
learning_rate_ = [.1, .3, .5]
n_estimators_ = [50, 100, 200]
max_depth_ = [3, 6]
grid_xgb = dict(learning_rate=learning_rate_,
                n_estimators=n_estimators_,
                max_depth=max_depth_)
# parameter grid for KNN
n_neighbors_ = [5, 10, 50]
grid_knn = dict(n_neighbors=n_neighbors_)
# parameter grid for SVM
C_ = [0.01, 1.0, 10.0]
grid_svm = {'C': C_}
# parameter grid for QDA
reg_param_ = [0.0]
grid_qda = dict(reg_param=reg_param_)
# parameter grid for LogRes
C_ = [0.01, 0.1, 1.0, 10.0]
penalty_ = ["l1", "l2"]
# The default lbfgs solver rejects the l1 penalty, which made every l1 candidate
# fail and score nan. liblinear supports both l1 and l2, so the solver is pinned
# in the grid to make all candidates valid.
solver_ = ["liblinear"]
grid_lr = dict(C=C_, penalty=penalty_, solver=solver_)
# parameter grid for RFs (note: n_estimators_ / max_depth_ are rebound here;
# grid_xgb keeps the lists it was built with)
n_estimators_ = [100, 300]
max_depth_ = [5, 8, None]
grid_rf = dict(n_estimators=n_estimators_, max_depth=max_depth_)

# scoring metrics evaluated by cross_validate (short key -> sklearn scorer name)
scoring = {'acc': 'accuracy',
           'prec': 'precision',
           'rec': 'recall',
           'f1': 'f1'}

# initialize metric lists; every model cell appends one entry to each
# test-set metrics
accuracy = []
precision = []
recall = []
f1 = []
auc = []
names = []
runtime = []
# cross-validation metrics
cv_acc = []
cv_prec = []
cv_rec = []
cv_f1 = []

# **XGB**

In [None]:
# Estimator under evaluation, its label in the result tables, and its search grid.
name = "xgb"
model = xgb.XGBClassifier(n_jobs=-1)
grid = grid_xgb

# Pre-processing: MinMax-scale the numerical columns and one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time).
num_pipeline = Pipeline([("minmax_scaler", MinMaxScaler())])
transformation_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_ix),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_ix),
])

# NESTED CROSS-VALIDATION
# Inner loop: a 3-fold grid search picks the hyper-parameters by F1 and refits
# the best candidate.
inner_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
gscv = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1',
    cv=inner_kfold,
    refit=True,
    n_jobs=-1,
    verbose=0,
)
# Full pipeline: data transformation followed by the grid search.
model_pipeline = Pipeline([
    ("transformer", transformation_pipeline),
    ("model", gscv),
])

# Outer loop: 5-fold cross-validation estimates the generalization error;
# only this step is timed.
skfold = StratifiedKFold(n_splits=5)
start_time = timer(None)
scores = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=skfold, n_jobs=-1)
time_off = timer(start_time)

# Average every metric over the outer folds and record it.
acc_cv, f1_cv, prec_cv, rec_cv = (
    np.mean(scores[f"test_{metric}"]) for metric in ("acc", "f1", "prec", "rec")
)
cv_acc.append(acc_cv)
cv_f1.append(f1_cv)
cv_prec.append(prec_cv)
cv_rec.append(rec_cv)

# Refit on the whole training split (re-runs the grid search), then score the
# held-out test split.
model_pipeline.fit(X_train, y_train)
predictions = model_pipeline.predict(X_test)

# Store the test-set classification metrics.
names.append(name)
for store, metric_fn in (
    (accuracy, accuracy_score),
    (precision, precision_score),
    (recall, recall_score),
    (f1, f1_score),
):
    store.append(metric_fn(y_test, predictions))
auc.append(roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1]))
runtime.append(time_off)

print("################################################")
print("CROSS-VALIDATION METRICS")
print(f"CV Accuracy for {name}: {np.round(acc_cv,3)}")
print(f"CV Precision for {name}: {np.round(prec_cv,3)}")
print(f"CV Recall for {name}: {np.round(rec_cv,3)}")
print(f"CV F1 for {name}: {np.round(f1_cv,3)}")
print("################################################")


 Time taken: 0 hours 13 minutes and 22.3 seconds.
################################################
CROSS-VALIDATION METRICS
CV Accuracy for xgb: 0.871
CV Precision for xgb: 0.776
CV Recall for xgb: 0.646
CV F1 for xgb: 0.705
################################################


# **KNN**

In [None]:
# Estimator under evaluation, its label in the result tables, and its search grid.
name = "knn"
model = KNeighborsClassifier()
grid = grid_knn

# Pre-processing: MinMax-scale the numerical columns and one-hot encode the
# categorical ones (categories unseen at fit time are ignored at transform time).
num_pipeline = Pipeline([("minmax_scaler", MinMaxScaler())])
transformation_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_ix),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_ix),
])

# NESTED CROSS-VALIDATION
# Inner loop: a 3-fold grid search picks the hyper-parameters by F1 and refits
# the best candidate.
inner_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
gscv = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1',
    cv=inner_kfold,
    refit=True,
    n_jobs=-1,
    verbose=0,
)
# Full pipeline: data transformation followed by the grid search.
model_pipeline = Pipeline([
    ("transformer", transformation_pipeline),
    ("model", gscv),
])

# Outer loop: 5-fold cross-validation estimates the generalization error;
# only this step is timed.
skfold = StratifiedKFold(n_splits=5)
start_time = timer(None)
scores = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=skfold, n_jobs=-1)
time_off = timer(start_time)

# Average every metric over the outer folds and record it.
acc_cv, f1_cv, prec_cv, rec_cv = (
    np.mean(scores[f"test_{metric}"]) for metric in ("acc", "f1", "prec", "rec")
)
cv_acc.append(acc_cv)
cv_f1.append(f1_cv)
cv_prec.append(prec_cv)
cv_rec.append(rec_cv)

# Refit on the whole training split (re-runs the grid search), then score the
# held-out test split.
model_pipeline.fit(X_train, y_train)
predictions = model_pipeline.predict(X_test)

# Store the test-set classification metrics.
names.append(name)
for store, metric_fn in (
    (accuracy, accuracy_score),
    (precision, precision_score),
    (recall, recall_score),
    (f1, f1_score),
):
    store.append(metric_fn(y_test, predictions))
auc.append(roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1]))
runtime.append(time_off)

print("################################################")
print("CROSS-VALIDATION METRICS")
print(f"CV Accuracy for {name}: {np.round(acc_cv,3)}")
print(f"CV Precision for {name}: {np.round(prec_cv,3)}")
print(f"CV Recall for {name}: {np.round(rec_cv,3)}")
print(f"CV F1 for {name}: {np.round(f1_cv,3)}")
print("################################################")




 Time taken: 0 hours 3 minutes and 16.41 seconds.




################################################
CROSS-VALIDATION METRICS
CV Accuracy for knn: 0.832
CV Precision for knn: 0.687
CV Recall for knn: 0.544
CV F1 for knn: 0.607
################################################


# **Logistic Regression**

In [None]:
# The grid searches both l1 and l2 penalties. The default lbfgs solver rejects
# l1 (those candidates failed and scored nan), so use liblinear, which supports both.
model = LogisticRegression(solver="liblinear")
name = "lr"
grid = grid_lr

# Feature transformation for numericals.
num_pipeline = Pipeline([("minmax_scaler", MinMaxScaler())])
# Apply MinMax to numericals and OneHot to categoricals.
transformation_pipeline = ColumnTransformer([("num", num_pipeline, num_ix),
                                             ("cat", OneHotEncoder(handle_unknown="ignore"), cat_ix)])

# NESTED CROSS-VALIDATION: inner folds for HP search, outer folds for the generalization error.
# Inner k-fold for the hyper-parameter search.
inner_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# Grid-search CV selects the best hyper-parameters by F1 and refits the winner.
gscv = GridSearchCV(estimator=model,
                    param_grid=grid,
                    cv=inner_kfold,
                    scoring='f1',
                    verbose=0,
                    n_jobs=-1,
                    refit=True)
# Model pipeline: data transformation, then grid search.
model_pipeline = Pipeline([("transformer", transformation_pipeline),
                           ("model", gscv)])

# Outer cross-validation estimates the generalization error; only this step is timed.
skfold = StratifiedKFold(n_splits=5)
start_time = timer(None)
scores = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=skfold, n_jobs=-1)
time_off = timer(start_time)

# Cross-validation metrics, averaged over the outer folds.
acc_cv = np.mean(scores['test_acc'])
cv_acc.append(acc_cv)
f1_cv = np.mean(scores['test_f1'])
cv_f1.append(f1_cv)
prec_cv = np.mean(scores['test_prec'])
cv_prec.append(prec_cv)
rec_cv = np.mean(scores['test_rec'])
cv_rec.append(rec_cv)

# Fit the pipeline on (X_train, y_train); this re-runs the grid search and refits the best model.
model_pipeline.fit(X_train, y_train)
# Make predictions on the held-out test split using the best model.
predictions = model_pipeline.predict(X_test)

# Store the test-set classification metrics.
names.append(name)
accuracy.append(accuracy_score(y_test, predictions))
precision.append(precision_score(y_test, predictions))
recall.append(recall_score(y_test, predictions))
f1.append(f1_score(y_test, predictions))
auc.append(roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1]))
runtime.append(time_off)

print("################################################")
print("CROSS-VALIDATION METRICS")
print(f"CV Accuracy for {name}: {np.round(acc_cv,3)}")
print(f"CV Precision for {name}: {np.round(prec_cv,3)}")
print(f"CV Recall for {name}: {np.round(rec_cv,3)}")
print(f"CV F1 for {name}: {np.round(f1_cv,3)}")
print("################################################")


 Time taken: 0 hours 0 minutes and 18.34 seconds.


12 fits failed out of a total of 24.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/linear_model/_logistic.py", line 449, in _check_solver
    % (solver, penalty)
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.65393571]


################################################
CROSS-VALIDATION METRICS
CV Accuracy for lr: 0.85
CV Precision for lr: 0.729
CV Recall for lr: 0.593
CV F1 for lr: 0.654
################################################


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# **SVM**

In [None]:
# Seed the solver so repeated runs give the same model.
model = LinearSVC(random_state=42)
name = "svm"
grid = grid_svm

# Feature transformation for numericals: this model standardizes them
# (StandardScaler) instead of MinMax scaling.
num_pipeline = Pipeline([("standard_scaler", StandardScaler())])
# Apply the scaler to numericals and OneHot to categoricals.
transformation_pipeline = ColumnTransformer([("num", num_pipeline, num_ix),
                                             ("cat", OneHotEncoder(handle_unknown="ignore"), cat_ix)])

# NESTED CROSS-VALIDATION: inner folds for HP search, outer folds for the generalization error.
# Inner k-fold for the hyper-parameter search.
inner_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# Grid-search CV selects the best hyper-parameters by F1 and refits the winner.
gscv = GridSearchCV(estimator=model,
                    param_grid=grid,
                    cv=inner_kfold,
                    scoring='f1',
                    verbose=0,
                    n_jobs=-1,
                    refit=True)
# LinearSVC has no predict_proba; wrapping the search in CalibratedClassifierCV
# provides the probabilities needed for the AUC below. Kept under its own name
# so `gscv` still refers to the grid search.
calibrated_gscv = CalibratedClassifierCV(gscv)
# Model pipeline: data transformation, then the calibrated grid search.
model_pipeline = Pipeline([("transformer", transformation_pipeline),
                           ("model", calibrated_gscv)])

# Outer cross-validation estimates the generalization error; only this step is timed.
skfold = StratifiedKFold(n_splits=5)
start_time = timer(None)
scores = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=skfold, n_jobs=-1)
time_off = timer(start_time)

# Cross-validation metrics, averaged over the outer folds.
acc_cv = np.mean(scores['test_acc'])
cv_acc.append(acc_cv)
f1_cv = np.mean(scores['test_f1'])
cv_f1.append(f1_cv)
prec_cv = np.mean(scores['test_prec'])
cv_prec.append(prec_cv)
rec_cv = np.mean(scores['test_rec'])
cv_rec.append(rec_cv)

# Fit the pipeline on (X_train, y_train); this re-runs the search and calibration.
model_pipeline.fit(X_train, y_train)
# Make predictions on the held-out test split.
predictions = model_pipeline.predict(X_test)

# Store the test-set classification metrics.
names.append(name)
accuracy.append(accuracy_score(y_test, predictions))
precision.append(precision_score(y_test, predictions))
recall.append(recall_score(y_test, predictions))
f1.append(f1_score(y_test, predictions))
auc.append(roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1]))
runtime.append(time_off)

print("################################################")
print("CROSS-VALIDATION METRICS")
print(f"CV Accuracy for {name}: {np.round(acc_cv,3)}")
print(f"CV Precision for {name}: {np.round(prec_cv,3)}")
print(f"CV Recall for {name}: {np.round(rec_cv,3)}")
print(f"CV F1 for {name}: {np.round(f1_cv,3)}")
print("################################################")


 Time taken: 0 hours 4 minutes and 19.77 seconds.




################################################
CROSS-VALIDATION METRICS
CV Accuracy for svm: 0.851
CV Precision for svm: 0.731
CV Recall for svm: 0.593
CV F1 for svm: 0.654
################################################


# **Random Forest**

In [None]:
# Seed the forest: without random_state every run grows different trees, so the
# reported metrics could not be reproduced.
model = RandomForestClassifier(random_state=42)
name = "rf"
grid = grid_rf

# Feature transformation for numericals.
num_pipeline = Pipeline([("minmax_scaler", MinMaxScaler())])
# Apply MinMax to numericals and OneHot to categoricals.
transformation_pipeline = ColumnTransformer([("num", num_pipeline, num_ix),
                                             ("cat", OneHotEncoder(handle_unknown="ignore"), cat_ix)])

# NESTED CROSS-VALIDATION: inner folds for HP search, outer folds for the generalization error.
# Inner k-fold for the hyper-parameter search.
inner_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# Grid-search CV selects the best hyper-parameters by F1 and refits the winner.
gscv = GridSearchCV(estimator=model,
                    param_grid=grid,
                    cv=inner_kfold,
                    scoring='f1',
                    verbose=0,
                    n_jobs=-1,
                    refit=True)
# Model pipeline: data transformation, then grid search.
model_pipeline = Pipeline([("transformer", transformation_pipeline),
                           ("model", gscv)])

# Outer cross-validation estimates the generalization error; only this step is timed.
skfold = StratifiedKFold(n_splits=5)
start_time = timer(None)
scores = cross_validate(model_pipeline, X_train, y_train, scoring=scoring, cv=skfold, n_jobs=-1)
time_off = timer(start_time)

# Cross-validation metrics, averaged over the outer folds.
acc_cv = np.mean(scores['test_acc'])
cv_acc.append(acc_cv)
f1_cv = np.mean(scores['test_f1'])
cv_f1.append(f1_cv)
prec_cv = np.mean(scores['test_prec'])
cv_prec.append(prec_cv)
rec_cv = np.mean(scores['test_rec'])
cv_rec.append(rec_cv)

# Fit the pipeline on (X_train, y_train); this re-runs the grid search and refits the best model.
model_pipeline.fit(X_train, y_train)
# Make predictions on the held-out test split using the best model.
predictions = model_pipeline.predict(X_test)

# Store the test-set classification metrics.
names.append(name)
accuracy.append(accuracy_score(y_test, predictions))
precision.append(precision_score(y_test, predictions))
recall.append(recall_score(y_test, predictions))
f1.append(f1_score(y_test, predictions))
auc.append(roc_auc_score(y_test, model_pipeline.predict_proba(X_test)[:, 1]))
runtime.append(time_off)

print("################################################")
print("CROSS-VALIDATION METRICS")
print(f"CV Accuracy for {name}: {np.round(acc_cv,3)}")
print(f"CV Precision for {name}: {np.round(prec_cv,3)}")
print(f"CV Recall for {name}: {np.round(rec_cv,3)}")
print(f"CV F1 for {name}: {np.round(f1_cv,3)}")
print("################################################")


 Time taken: 0 hours 4 minutes and 2.12 seconds.
################################################
CROSS-VALIDATION METRICS
CV Accuracy for rf: 0.846
CV Precision for rf: 0.704
CV Recall for rf: 0.613
CV F1 for rf: 0.656
################################################


# **Results Test Set**

In [None]:
# Test-set metrics, one row per model. The frame is built column by column so
# the metrics stay numeric: np.column_stack over names and floats produces a
# single string array, which turned every metric into text.
results_df = pd.DataFrame({
    "name": names,
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "auc": auc,
    "runtime": runtime,
})
results_df.to_excel("results_test.xlsx")
# One row per model, so the whole frame is small enough to display.
results_df

Unnamed: 0,name,accuracy,precision,recall,f1,auc,runtime
0,xgb,0.8741938785955574,0.7914078674948241,0.6492569002123142,0.7133193375320738,0.9263397932527948,0.0 hours 13.0 minutes and 22.3 seconds.
1,knn,0.8196335346504248,0.6510443199184921,0.5426751592356688,0.5919407132931912,0.837972545199104,0.0 hours 3.0 minutes and 16.41 seconds.
2,lr,0.8499334629951889,0.7406605305901461,0.5808917197452229,0.6511185149928606,0.902584454612465,0.0 hours 0.0 minutes and 18.34 seconds.
3,svm,0.8509571092230526,0.744426318651441,0.5813163481953291,0.6528373867429661,0.9025823354793852,0.0 hours 4.0 minutes and 19.77 seconds.
4,rf,0.8437915856280069,0.7057071960297767,0.6038216560509554,0.6508009153318078,0.8910669663235389,0.0 hours 4.0 minutes and 2.12 seconds.


# **Results CV**

In [None]:
# Cross-validation metrics (mean over the outer folds), one row per model. The
# frame is built column by column so the metrics stay numeric: np.column_stack
# over names and floats produces a single string array.
resultscv_df = pd.DataFrame({
    "name": names,
    "accuracy": cv_acc,
    "precision": cv_prec,
    "recall": cv_rec,
    "f1": cv_f1,
})
resultscv_df.to_excel("results_cv.xlsx")
# One row per model, so the whole frame is small enough to display.
resultscv_df

Unnamed: 0,name,accuracy,precision,recall,f1
0,xgb,0.8710105440945357,0.7761371205850703,0.6463761351756777,0.7053088603455635
1,knn,0.8315459349911023,0.6866795181430818,0.5437176181791148,0.6065720039253542
2,lr,0.8502801327276321,0.729376869914389,0.5932256584865703,0.6542673866269016
3,svm,0.8505616622951964,0.73080800716529,0.592582743894493,0.6544568974666749
4,rf,0.8461341120265626,0.7042387040729927,0.613372726849994,0.6556561205843432
