In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_val_score,cross_val_predict,train_test_split,cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_curve,roc_auc_score, accuracy_score
from datetime import datetime

In [None]:
def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Parameters
    ----------
    start_time : datetime.datetime, optional
        Omit (or pass ``None``) to start timing. Pass the value returned by an
        earlier call to report how much time has elapsed since then.

    Returns
    -------
    datetime.datetime or str
        The current time when starting; otherwise a summary such as
        ``"0 hours 46 minutes and 6.22 seconds."`` (the same text is printed).
    """
    if start_time is None:
        return datetime.now()
    elapsed = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    # divmod on floats yields floats; format once with %i so the printed message
    # and the returned string agree ("46 minutes", not "46.0 minutes").
    summary = '%i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2))
    print('\n Time taken: ' + summary)
    return summary

In [None]:
# Load adult-all.csv from GitHub. Column names are supplied explicitly via
# `names`, and '?' entries are parsed as missing values.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
column_names = [
    "Age",
    "Workclass",
    "Final Weight",
    "Education",
    "Education Number of Years",
    "Marital-status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-gain",
    "Capital-loss",
    "Hours-per-week",
    "Native-country",
    "Target",
]
df_raw = pd.read_csv(url, names=column_names, na_values='?')

In [None]:
# Work on a copy so df_raw stays untouched, and drop rows with missing data.
df = df_raw.copy().dropna()
# Label-encode the target. assign() replaces the whole column, so it becomes a
# true integer column (an in-place `df.loc[:, "Target"] = ...` can leave it as
# object dtype on recent pandas, which breaks stratified splitting later).
# astype fails loudly if a label is not covered by `mapping`.
mapping = {"<=50K": 0,
           '>50K': 1}
df = df.assign(Target=df["Target"].map(mapping).astype("int64"))
# Identify the feature list. Compare with != : `f not in ("Target")` is a
# substring test against the string "Target", not tuple membership.
features = [f for f in df.columns if f != "Target"]
# Select categorical and numerical features.
cat_ix = df[features].select_dtypes(include=['object', 'bool']).columns
num_ix = df[features].select_dtypes(include=['int64', 'float64']).columns
# Split the dataframe into X and y. y gets a fresh 0..n-1 index so that it
# lines up with X_df below (dropna leaves gaps in the original index).
X = df[features]
y = df["Target"].reset_index(drop=True)
# NOTE(review): the scaler and encoder below are fit on the full dataset before
# the train/test split, so test-set information leaks into the features.
# Consider fitting them on the training split only (e.g. ColumnTransformer in
# a Pipeline); note that create_dense currently assumes 104 input columns.
# Scale numerical features to [0, 1].
scaler = MinMaxScaler()
Xnum = scaler.fit_transform(X[num_ix])
num_feat_names = X[num_ix].columns.to_list()
Xnum_df = pd.DataFrame(Xnum, columns=num_feat_names)
# One-hot encode categorical features.
onehot = OneHotEncoder(handle_unknown="ignore")
Xcat = onehot.fit_transform(X[cat_ix])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older releases.
if hasattr(onehot, "get_feature_names_out"):
    cat_feat_names = onehot.get_feature_names_out().tolist()
else:
    cat_feat_names = onehot.get_feature_names().tolist()
Xcat_df = pd.DataFrame(Xcat.toarray(), columns=cat_feat_names)
# Concatenate numerical + categorical blocks (both carry a default 0..n-1 index).
X_df = pd.concat([Xnum_df, Xcat_df], axis=1)
# Perform the train-test split.
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)



In [None]:
from keras.models import Sequential,Model
from keras.layers import Dense,Input,BatchNormalization,Dropout
import tensorflow as tf
from tensorflow import random
from keras.wrappers.scikit_learn import KerasRegressor,KerasClassifier

def create_dense(neurons, input_dim=104):
  """Build and compile a two-hidden-layer dense binary classifier.

  Parameters
  ----------
  neurons : int
      Width of the first hidden layer; the second hidden layer uses half as
      many units (integer division, at least 1).
  input_dim : int, default 104
      Number of input features. The default matches the width this notebook
      previously hard-coded.

  Returns
  -------
  keras Model
      Compiled with the Adam optimizer and binary cross-entropy loss, tracking
      accuracy plus the f1_m, precision_m and recall_m metrics defined in this
      notebook.
  """
  # shape must be a tuple: (104) is just the int 104, (104,) is a 1-tuple
  inputs=Input(shape=(input_dim,))
  x=Dense(neurons,activation="relu")(inputs)
  # Dense expects an integer unit count; neurons/2 would pass a float
  x=Dense(max(1, int(neurons)//2),activation="relu")(x)
  x=Dense(1,activation='sigmoid')(x)
  model=Model(inputs=inputs,outputs=x)
  model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['acc',f1_m,precision_m, recall_m])
  return model
  
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Counts the rounded, clipped products ``y_true * y_pred`` as true positives
    and divides by the count of rounded, clipped ``y_true`` entries; epsilon in
    the denominator avoids division by zero.
    Assumes y_true holds 0/1 labels and y_pred values in [0, 1] - TODO confirm.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_positives = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual_positives) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Counts the rounded, clipped products ``y_true * y_pred`` as true positives
    and divides by the count of rounded, clipped ``y_pred`` entries; epsilon in
    the denominator avoids division by zero.
    Assumes y_true holds 0/1 labels and y_pred values in [0, 1] - TODO confirm.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_positive = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged_positive) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: twice precision*recall over (precision + recall + epsilon).

    Precision and recall come from precision_m and recall_m; epsilon keeps the
    ratio defined when both are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

# **Grid-search CV**

In [None]:
# accumulators for the per-fold scores and the final summary row
accuracy=[]
precision=[]
recall=[]
f1=[]
names=[]
runtime=[]
cverror=[]
# define initial params
name="mlp"
thr=0.5
# outer cross-validation folds (the same splitter is reused for the inner grid search)
cv_outer = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
# convert the training data to arrays so folds can be selected by position;
# labels are forced to int in case they arrive as an object-dtype column
X_train=np.array(X_train)
y_train=np.asarray(y_train, dtype=int)
# the search space does not change between folds, so define it once
# (the test-set cell below reuses `space`)
space = {'neurons': [64, 128]}
# turn on chronometer
start_time = timer(None)
for train_ix, test_ix in cv_outer.split(X_train,y_train):
  # reset Keras state and re-seed so every fold starts from the same conditions
  tf.keras.backend.clear_session()
  np.random.seed(123)
  tf.random.set_seed(123)
  # split data
  Xtrain, Xval = X_train[train_ix], X_train[test_ix]
  ytrain, yval = y_train[train_ix], y_train[test_ix]
  # call grid-search
  model=KerasClassifier(build_fn=create_dense, verbose=0, epochs=50)
  search = GridSearchCV(model, space, scoring='f1', cv=cv_outer,verbose=0, refit=True)
  # execute search on training fold
  result = search.fit(Xtrain, ytrain)
  # get the best performing model (already refit on the whole training fold)
  best_model = result.best_estimator_
  # predict() returns hard 0/1 labels, so thresholding it would make `thr` a
  # no-op; threshold the positive-class probability instead
  proba = best_model.predict_proba(Xval)[:, 1]
  predicted_classes = (proba >= thr).astype(int)
  # evaluate metrics on the validation fold
  accuracy.append(accuracy_score(yval, predicted_classes))
  precision.append(precision_score(yval, predicted_classes))
  recall.append(recall_score(yval, predicted_classes))
  f1.append(f1_score(yval, predicted_classes))
time_off=timer(start_time)
runtime.append(time_off)
names.append(name)
# save CV results to excel; build the frame from a dict so the metrics stay
# numeric (np.column_stack would coerce every value to a string)
results_df=pd.DataFrame({
    "name": names,
    "accuracy": [np.mean(accuracy)],
    "precision": [np.mean(precision)],
    "recall": [np.mean(recall)],
    "f1": [np.mean(f1)],
    "runtime": runtime,
})
results_df.to_excel("results_cv.xlsx")
results_df




 Time taken: 0 hours 46 minutes and 6.22 seconds.


Unnamed: 0,name,accuracy,precision,recall,f1,runtime
0,mlp,0.8283163335821101,0.6620545433785247,0.6195568196466367,0.6399621603275794,0.0 hours 46.0 minutes and 6.22 seconds.


# **Predictions on test set**

In [None]:
# accumulators for the test-set metrics
accuracy=[]
precision=[]
recall=[]
f1=[]
names=[]
auc=[]
name="mlp"
# reset Keras state and seed the generators, as in the cross-validation cell
tf.keras.backend.clear_session()
np.random.seed(123)
tf.random.set_seed(123)
model=KerasClassifier(build_fn=create_dense, verbose=0, epochs=50)
# call grid-search (`space`, `cv_outer` and `thr` come from the grid-search CV cell above)
search = GridSearchCV(model, space, scoring='f1', cv=cv_outer, refit=True)
# execute search on entire training set
result = search.fit(X_train, y_train)
# refit=True means best_estimator_ has already been retrained on all of
# (X_train, y_train); fitting it a second time only repeats the training
best_model = result.best_estimator_
# positive-class probabilities: needed both for a meaningful threshold and for
# ROC AUC, which must be computed from scores rather than hard 0/1 labels
predictions=best_model.predict_proba(np.asarray(X_test))[:, 1]
predicted_classes = (predictions >= thr).astype(int)
y_true = np.asarray(y_test, dtype=int)
# store classification metrics
names.append(name)
accuracy.append(accuracy_score(y_true,predicted_classes))
precision.append(precision_score(y_true,predicted_classes))
recall.append(recall_score(y_true,predicted_classes))
f1.append(f1_score(y_true,predicted_classes))
auc.append(roc_auc_score(y_true,predictions))
# save metrics to excel; a dict-built frame keeps the metrics numeric
# (np.column_stack would coerce every value to a string)
results_df=pd.DataFrame({
    "name": names,
    "accuracy": accuracy,
    "precision": precision,
    "recall": recall,
    "f1": f1,
    "auc": auc,
})
results_df.to_excel("results.xlsx")
results_df