In [None]:
# import the proper libraries and load the dataset for modeling

import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import copy
import random as rand

data = pd.read_csv("c:\\Users\\markm\\Desktop\\CAPSTONE\\capstone\\data\\external\\watson_healthcare_modified.csv")

In [None]:
# take a look at the first 5 rows of data for familiarization
data.head()


In [None]:
# use the .info() call to see the total number of rows, columns, and datatypes of columns.
### Open the full output in a text editor if size limit is reached
data.info()

In [None]:
# let's slice the data into x and y, for building predictive models
y = data['Attrition']
x = data.drop(columns=['EmployeeID','Attrition'])

In [None]:
# look at the shape of the x and y dataframes to ensure they matchup
print(x.shape)
print(y.shape)

In [None]:
# look at the target variable to see if we have an imbalanced dataset
y.value_counts()

#### In this case, we do have an imbalanced dataset because there are ~88% 'No's and ~12% 'Yes's 

In [None]:
# Change the target variable to quantitative for modeling purposes.
# The encoded column is written back to `data` explicitly because the plotting cell
# below filters on `data.Attrition == 0` / `== 1`; an in-place `replace` on the
# `data['Attrition']` slice is not guaranteed by pandas to write through to `data`.
# Values that are not 'No'/'Yes' (e.g. already-encoded 0/1) are passed through
# unchanged, so re-running this cell is harmless.
attrition_codes = {'No': 0, 'Yes': 1}
data['Attrition'] = data['Attrition'].map(lambda label: attrition_codes.get(label, label))
y = data['Attrition']

In [None]:
# plot histograms of each numerical value
# This will look at variances within each individual feature, allowing us to eliminate features without variance
import matplotlib.pyplot as plt

#fig = plt.figure(figsize = (20,12))
x.hist(figsize = (24,20))

plt.show()


In [None]:
# now let's look at some of the columns with the y value as well
fig, ax = plt.subplots(nrows=1, ncols=2, figsize = (20,10))

# one panel per outcome: (Attrition code, marker colour, legend label, panel title)
panels = [
    (0, 'b', 'stayed', 'Education level vs Age for Stayed'),
    (1, 'r', 'attrited', 'Education level vs Age for Attrition'),
]

for axis, (code, colour, label, title) in zip(ax, panels):
    mask = data.Attrition == code
    axis.scatter(data.Education[mask], data.Age[mask], c=colour, label=label)
    axis.set_xlabel('Education Level')
    # both panels get a y label so each one can be read on its own
    axis.set_ylabel('Age')
    axis.set_title(title, loc='center')
    # the `label=` passed to scatter is only rendered once a legend is requested
    axis.legend()

plt.show()

In [None]:
# Check the Over18 column to see if there is variation in this feature
x.Over18.unique()

In [None]:
# based on this analysis, let's drop 'Over18' (inspected in the previous cell), 'EmployeeCount' & 'StandardHours' because there is no variance in these features
x = x.drop(columns=['Over18','EmployeeCount','StandardHours'], axis=1)

## Now that we have initially analyzed the data, let's pass this data into 'LazyClassifier' to figure out which models could work best with the data

In [None]:
# import the necessary modules 
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

In [None]:
# split the x and y dataframes then call the LazyClassifier and display the results
x_train_l, x_test_l, y_train_l, y_test_l = train_test_split(x,y, test_size =0.2, random_state=42)

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

models, predictions = clf.fit(x_train_l,x_test_l,y_train_l,y_test_l)

models

In [None]:
# look at the datatypes that are not quantitative
data.select_dtypes(include=['object']).head(2)

In [None]:
# Print the unique values of each 'object' feature
print("Department: ", data.Department.unique())
print("Business Travel: ",data.BusinessTravel.unique())
print("EducationField: ",data.EducationField.unique())
print("Gender: ",data.Gender.unique())
print("JobRole: ",data.JobRole.unique())
print("MaritalStatus	: ",data.MaritalStatus.unique())
#print("Over18: ",data.Over18.unique())
print("OverTime: ", data.OverTime.unique())


### Analysis
- I am going to encode these variables using 'get_dummies' for modeling purposes, so that all pertinent variables are quantitative

In [None]:
# example of the get_dummies call, including prefix for understanding of new variables
pd.get_dummies(x['Department'], prefix='Department')

In [None]:
### function for converting the 'object' type features to quantitative features
def convert_numeric(data):
    """One-hot encode every ``object``-dtype column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``object`` columns are replaced by indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the non-object columns in their original
        order, followed by the ``<column>_<value>`` indicator columns of each
        encoded column (encoded columns are processed in their original
        order). ``data`` itself is never modified or returned.
    """
    categorical_columns = list(data.select_dtypes(include=['object']).columns)

    # Build every indicator block first and concatenate once: re-concatenating
    # the growing frame inside the loop copies it once per encoded column.
    pieces = [data.drop(columns=categorical_columns)]
    pieces.extend(pd.get_dummies(data[col], prefix=col) for col in categorical_columns)

    return pd.concat(pieces, axis=1)

In [None]:
# convert the data and show the first two rows of the new dataframe
x = convert_numeric(x)

x.head(2)

In [None]:
# look at the names of the new columns
x.columns

## First Method : Logistic Regression

### Base model with unbalanced data (before using SMOTE)

In [None]:
# create the logistic regression  model
from sklearn.linear_model import LogisticRegression

# create the base instance of the model for exploration, setting the max_iterations high so the model can reach convergence
log_r = LogisticRegression(max_iter=100000)


In [None]:
# split the data after numeric conversion
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.2, random_state=42)

# fit the data on the training data
log_r.fit(x_train, y_train)

In [None]:
# generate the predictions and show the overall 'accuracy' of the model 
predictions = log_r.predict(x_test)
score = log_r.score(x_test, y_test)
print(score)

### For better understanding of model performance, let's use a confusion matrix to show more in depth metrics

In [None]:
# import the necessary modules for creating the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report

cm = metrics.confusion_matrix(y_test, predictions)

In [None]:
# plot the confusion matrix and print the report
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `score` is a fraction in [0, 1]; show it as a percentage with two decimals
# (the template has a single placeholder, so only one argument is passed)
all_sample_title = 'Accuracy Score: {:.2f}%'.format(score*100)
plt.title(all_sample_title, size = 15)

print(classification_report(y_test,predictions))

plt.show()

### Let's try running the base model after balancing the data set to see if we have improvement

In [None]:
# # pip install imbalanced-learn
# import imblearn

from imblearn.over_sampling import SMOTE

# transform the dataset
# SMOTE draws its synthetic minority samples at random; seed it (same value as the
# train/test split) so the balanced training set - and every score computed from a
# model trained on it - is reproducible across runs
oversample = SMOTE(random_state=42)
x_bal_train, y_bal_train = oversample.fit_resample(x_train, y_train)

In [None]:
# look at the new re_sampled data, showcasing balance in the training data
y_bal_train.value_counts()

In [None]:

# fit the data on the balanced training data
log_r.fit(x_bal_train, y_bal_train)

In [None]:
# generate predictions for the model trained on 'balanced' training data and show accuracy
predictions = log_r.predict(x_test)
logr_score = log_r.score(x_test, y_test)
print(logr_score)

In [None]:
# plot the confusion matrix of this 'balanced' log_r model and print the report for analysis
cm = metrics.confusion_matrix(y_test, predictions)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `logr_score` is a fraction in [0, 1]; show it as a percentage with two decimals
# (the template has a single placeholder, so only one argument is passed)
all_sample_title = 'Accuracy Score: {:.2f}%'.format(logr_score*100)
plt.title(all_sample_title, size = 15)

print(classification_report(y_test,predictions))

#### Overall, the balancing of the data actually worsened our results by a little bit. For this reason, I will choose not to use SMOTE for the log_r model
- Also, using SMOTE could cause explainability issues as it is a more complex technique. Therefore, keeping the data 'unbalanced' will allow for easier understanding of the model 

## Second Model: XGBoost Classifier

In [None]:
# Now let's create and test our XGBoost Classifier model
import xgboost as xgb

# create model instances for both 'unbalanced' and 'balanced' training data
xgb_cl = xgb.XGBClassifier()
xgb_bal_cl = xgb.XGBClassifier()

In [None]:
# split the data after numeric conversion
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size =0.2, random_state=42)

# transform the dataset using SMOTE
# seeded like the split above so the 'balanced' model sees the same synthetic
# samples on every run and its metrics can be compared between runs
oversample = SMOTE(random_state=42)
x_bal_train, y_bal_train = oversample.fit_resample(x_train, y_train)

# Fit the unbalanced model
xgb_cl.fit(x_train, y_train)

# Fit the 'balanced' model
xgb_bal_cl.fit(x_bal_train, y_bal_train)

In [None]:
# Predict on the 'unbalanced' model and show the overall accuracy
preds = xgb_cl.predict(x_test)

xgb_score = xgb_cl.score(x_test, y_test)
print(xgb_score)

In [None]:
# plot the confusion matrix of the 'unbalanced' XGBoost model and print the report
cm = metrics.confusion_matrix(y_test, preds)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `xgb_score` is a fraction in [0, 1]; show it as a percentage with two decimals
# (the template has a single placeholder, so only one argument is passed)
all_sample_title = 'Accuracy Score: {:.2f}%'.format(xgb_score*100)
plt.title(all_sample_title, size = 15)

print(classification_report(y_test,preds))

In [None]:
# Predict on the 'balanced' model and show the overall accuracy
preds_bal = xgb_bal_cl.predict(x_test)

xgb_bal_score = xgb_bal_cl.score(x_test, y_test)
print(xgb_bal_score)

In [None]:
# plot the confusion matrix of the 'balanced' XGBoost model and print the report
cm = metrics.confusion_matrix(y_test, preds_bal)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `xgb_bal_score` is a fraction in [0, 1]; show it as a percentage with two decimals
# (the template has a single placeholder, so only one argument is passed)
all_sample_title = 'Accuracy Score: {:.2f}%'.format(xgb_bal_score*100)
plt.title(all_sample_title, size = 15)

print(classification_report(y_test,preds_bal))

#### For XGBoost, I will use the 'balanced' model. Even though its overall accuracy was slightly worse, it did have a lower false-negative count. This is the number I am trying to minimize (without too much of a tradeoff in model performance), so the improvement in this category [bottom-left cell, where we predicted 'No' and it was actually 'Yes'] is worth the 1% loss in accuracy.

## THIRD MODEL : Neural Network
- I will not be using this model in the actual 'run.py'. This attempt is just for fun, to see if a 'simple' neural network could outperform the logistic regression / XGBoost classifier.

- Even though I kept this network very simple, it took a lot longer to train than the other models. Adding parameters or a better architecture to try to improve performance would only add to the computation time needed to train the network, so I am not going to be pursuing this model type to solve the attrition problem.

In [None]:
#define custom loss for unbalanced data set using inverse weights

class CrossEntropyLoss(tf.keras.losses.Loss):
    """Unweighted element-wise cross-entropy loss evaluated in float64."""

    def __init__(self):
        super().__init__(name='CrossEntropyLoss')

    def call(self, y_true, y_pred):
        """
        Return the cross-entropy between ``y_true`` and ``y_pred`` as a scalar.

        Both tensors are cast to float64. The element-wise loss is averaged
        over axis 0 and the resulting values are summed. No class weighting is
        applied. ``tf.debugging.assert_all_finite`` raises if any element of
        the loss is not finite.

        NOTE(review): the formula takes log(y_pred) directly, so ``y_pred`` is
        presumably a probability in [0, 1] rather than a logit - confirm
        against the model's output activation.
        """
        targets = tf.cast(y_true, dtype=tf.float64)
        probs = tf.cast(y_pred, dtype=tf.float64)

        # small offset so log() is never evaluated at exactly zero
        eps = tf.constant(1e-7, dtype=tf.float64)

        positive_term = tf.math.log(probs + eps) * targets
        negative_term = tf.math.log((1 - probs) + eps) * (1 - targets)
        per_element = (positive_term + negative_term) * -1.0

        tf.debugging.assert_all_finite(per_element, 'There are nan values')

        column_means = tf.reduce_mean(per_element, axis=0)
        return tf.reduce_sum(column_means)

 

class ClassImbalanceSparsityAdjustedCEL(tf.keras.losses.Loss):
    """Cross-entropy loss re-weighted by the positive/negative balance of the batch
    and by caller-supplied inverse class weights."""

    def __init__(self, inverse_class_weights):
        """
        Store the inverse class weights.

        ``inverse_class_weights`` multiplies the per-column mean loss in
        ``call`` and must therefore broadcast against it.
        NOTE(review): presumably one weight per output column - confirm
        against the caller.
        """
        super().__init__(name='ClassImbalanceSparsityAdjustedCEL')
        self.inverse_class_weights = inverse_class_weights

    def call(self, y_true, y_pred):
        """
        Cross entropy loss adjusted for class imbalance and one-hot encoding sparsity.

        With P the sum of ``y_true`` and N the sum of ``1 - y_true`` over the
        whole tensor, the positive term is scaled by (P + N) / P and the
        negative term by (P + N) / N. The element-wise loss is averaged over
        axis 0, multiplied by ``inverse_class_weights`` and summed to a scalar.
        ``tf.debugging.assert_all_finite`` raises if any element of the loss
        is not finite.
        """
        # cast first so the counts below are computed in float64 whatever the label dtype
        y_true = tf.cast(y_true, dtype=tf.float64)
        y_pred = tf.cast(y_pred, dtype=tf.float64)

        P = tf.reduce_sum(y_true)
        N = tf.reduce_sum(1.0 - y_true)
        total = P + N

        # A batch without positives (P == 0) or without negatives (N == 0) would turn a
        # plain division into inf, and inf * 0 below into NaN. divide_no_nan yields 0 for
        # that factor instead; the term it scales is multiplied by an all-zero mask anyway.
        beta_P = tf.math.divide_no_nan(total, P)
        beta_N = tf.math.divide_no_nan(total, N)

        epsilon = tf.constant(1e-7, dtype=tf.float64)  # keeps log() away from log(0)

        loss = (beta_P * tf.math.log(y_pred + epsilon) * y_true
                + beta_N * tf.math.log((1 - y_pred) + epsilon) * (1 - y_true)) * -1.0

        tf.debugging.assert_all_finite(loss, 'There are nan values')

        # match the float64 loss so lists / float32 tensors of weights multiply cleanly
        weights = tf.cast(self.inverse_class_weights, dtype=tf.float64)
        return tf.reduce_sum(tf.reduce_mean(loss, axis=0) * weights)

 



In [None]:
# function to plot accuracy and loss

import matplotlib.pyplot as plt
%matplotlib inline

def plot_acc_loss(history, metric='precision'):
    """Plot the per-epoch curves of one metric (top) and of the loss (bottom).

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` dict that maps series names to
        per-epoch values, e.g. the value returned by ``model.fit``.
    metric : str, default 'precision'
        Name of the metric shown in the top panel. If that exact key is not
        logged, a numbered variant ``<metric>_<n>`` is used instead (Keras may
        log e.g. ``precision_1`` when the metric object is created a second
        time in the same session, such as after re-running the training cell).

    Raises
    ------
    KeyError
        If neither ``metric`` nor a numbered variant of it is present.
    """
    import re

    records = history.history

    # resolve the key the metric was actually logged under
    if metric in records:
        metric_key = metric
    else:
        numbered = re.compile(r'^{}_\d+$'.format(re.escape(metric)))
        matches = [key for key in records if numbered.match(key)]
        if not matches:
            raise KeyError("metric '{}' not found in history; available keys: {}".format(
                metric, sorted(records)))
        metric_key = matches[0]

    def _draw_panel(position, key, title, ylabel):
        # One subplot: the training series plus, when it was logged, its 'val_' twin.
        plt.subplot(position)
        plt.plot(records[key])
        legend = ['train']
        val_key = 'val_' + key
        if val_key in records:
            plt.plot(records[val_key])
            legend.append('val')
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('epoch')
        plt.legend(legend, loc='upper left')

    plt.figure(1)

    # summarize history for the chosen metric
    _draw_panel(211, metric_key, metric.upper(), '{} ratio'.format(metric.capitalize()))

    # summarize history for loss
    _draw_panel(212, 'loss', 'model loss', 'loss')

    plt.tight_layout()
    plt.show()

In [None]:


#x_train, x_val, y_train, y_val = train_test_split(x,y, test_size =0.2, random_state=42)

# set the parameters for the network
batch_size = 32
epochs = 50
n_hidden_units = 128
n_hidden_layers = 6

# learning rate handed to the Adam optimizer below
lr = .002

# one hot encode the target variable into one column per class.
# The training labels get their own name so the `y_train` produced by the split is not
# overwritten; `y_val` keeps its name because the evaluation cell below reads it.
n_classes = len(np.unique(y))
y_train_ohe = keras.utils.to_categorical(y_bal_train, n_classes)
y_val = keras.utils.to_categorical(y_test, n_classes)

# feed Keras plain float32 arrays (same cast the prediction cell applies to x_test)
x_train_nn = np.asarray(x_bal_train).astype('float32')
x_val_nn = np.asarray(x_test).astype('float32')

# need to do [1:] to allow for dimensionality compatibility
a = b = keras.layers.Input(shape = x_train_nn.shape[1:])

print('****** With Normalization Layer *******')
a = keras.layers.LayerNormalization(axis= -1)(a)

a = keras.layers.Flatten()(a)


print('Residual True')
a = keras.layers.Dense(n_hidden_units)(a)
for _ in range(n_hidden_layers):
    # residual block: add the block input back onto the Dense output
    a_resid = a
    a = keras.layers.Dense(n_hidden_units, activation=keras.activations.relu)(a)
    a = keras.layers.Add()([a, a_resid])

# output layer: one softmax probability per class
a = keras.layers.Dense(y_train_ohe.shape[1], activation = keras.activations.softmax)(a)

# build the model from the input tensor `b` to the output tensor `a`
model = keras.Model(b,a)

# Prep the model for -learning-
# - the softmax layer already emits probabilities, so the loss must not treat them as logits
# - a threshold of 0 is exceeded by every softmax output; 0.5 on the 'Yes' column
#   (class_id=1) reports the precision of the attrition predictions
model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer=keras.optimizers.Adam(learning_rate=lr),
              metrics=[keras.metrics.Precision(thresholds=0.5, class_id=1)])

# train with one summary line per epoch (verbose = 2)
history = model.fit(x_train_nn, y_train_ohe,
    batch_size = batch_size,
    epochs = epochs,
    verbose = 2,
    validation_data = (x_val_nn, y_val)
            )
# final [loss, precision] on the held-out split
score = model.evaluate(x_val_nn, y_val, verbose = 1)

plot_acc_loss(history)

print(score)

In [None]:
from sklearn import metrics

# probability of class 1 ('Yes') above which an employee is predicted to leave
threshold = 0.45

# one row per employee, one probability column per class
output = model.predict(np.asarray(x_test).astype('float32'))

actual = np.argmax(y_val, axis=-1)

# Apply the threshold directly to the class-1 column. Overwriting rows with [0, 1] and
# then taking argmax only honours the threshold when it is <= 0.5, because argmax still
# picks class 1 for every row whose class-1 probability is above 0.5.
predicted = (output[:, 1] > threshold).astype(int)

n_correct = int(np.sum(predicted == actual))
print("Accuracy of predictions: ",round((n_correct / actual.size) * 100,2))

# fix the label order so the matrix stays 2x2 even if one class is never predicted
confusion_matrix = metrics.confusion_matrix(actual, predicted, labels=[0, 1])

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True])

cm_display.plot()
plt.show()



In [None]:


def create_and_test_model(data, learning_rate_range_start, learning_rate_range_end, learning_rate_step
                          , val_accuracy_thresh, batch_size_range_start
                          , batch_size_range_end, batch_size_step, epoch_range_start
                          , epoch_range_end, epoch_range_step, iter_size, max_score, min_loss
                          , best_batch_size, best_num_epochs, best_lr, best_model, best_history, normalize = False):
    """Random search over batch size, epoch count and SGD learning rate for a
    single-unit sigmoid classifier predicting ``Attrition``.

    Each iteration draws one random combination, trains a fresh model on an
    80/20 split (``random_state=42``) and scores it with validation recall.
    The search stops once the best score reaches ``val_accuracy_thresh`` or
    after ``iter_size`` iterations.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``Attrition`` column ('No'/'Yes' or already 0/1). All
        other columns are used as features; non-numeric ones are one-hot
        encoded. ``data`` is not modified.
    learning_rate_range_start, learning_rate_range_end, learning_rate_step : float
        ``np.arange`` grid the learning rate is drawn from.
    val_accuracy_thresh : float
        Validation score (recall) at which the search stops early.
    batch_size_range_start, batch_size_range_end, batch_size_step : int
        ``random.randrange`` arguments for the batch size.
    epoch_range_start, epoch_range_end, epoch_range_step : int
        ``random.randrange`` arguments for the number of epochs.
    iter_size : int
        Maximum number of combinations to try.
    max_score, min_loss, best_batch_size, best_num_epochs, best_lr, best_model, best_history
        Initial "best so far" values; each is replaced whenever an iteration
        beats ``max_score``. ``min_loss`` holds the loss of the best-scoring
        model, not the minimum loss seen.
    normalize : bool, default False
        Insert a ``LayerNormalization`` layer before the output unit.

    Returns
    -------
    tuple
        ``(df, max_score, min_loss, best_batch_size, best_num_epochs, best_lr,
        best_model, best_history)`` where ``df`` has one row per iteration with
        the columns score, loss, batch_size, epochs and learning_rate.
    """
    # encode the target on a fresh Series so the caller's frame is left untouched;
    # values other than 'No'/'Yes' (already-encoded labels) pass through unchanged
    label_codes = {'No': 0, 'Yes': 1}
    y = data['Attrition'].map(lambda label: label_codes.get(label, label)).astype('float32')

    # get_dummies expands object/category columns and leaves numeric columns as they are,
    # so every feature can be cast to the float32 Keras trains on
    x = pd.get_dummies(data.drop(columns='Attrition')).astype('float32')

    x_train, x_val, y_train, y_val = train_test_split(x,y, test_size =0.2, random_state=42)
    x_train, x_val = x_train.to_numpy(), x_val.to_numpy()
    y_train, y_val = y_train.to_numpy(), y_val.to_numpy()

    # initialize the counter for the iterations, and the list to store the data
    iteration = 0
    data_list = []

    # try random hyper-parameter combinations until the score is good enough or the budget is spent
    while (max_score < val_accuracy_thresh) and (iteration < iter_size):

        # need to do [1:] to allow for dimensionality compatibility
        inputs = keras.layers.Input(shape = x_train.shape[1:])
        outputs = inputs

        if normalize:
            print('****** With Normalization Layer *******')
            outputs = keras.layers.LayerNormalization(axis= -1)(outputs)

        # A single output unit needs sigmoid: softmax normalises across units, so over
        # one unit it always outputs 1.0 and the model could never learn.
        outputs = keras.layers.Dense(1, activation = keras.activations.sigmoid)(outputs)

        # random draws cover the search space faster than an exhaustive sweep
        batch_size = rand.randrange(batch_size_range_start, batch_size_range_end, batch_size_step)
        epochs = rand.randrange(epoch_range_start, epoch_range_end, epoch_range_step)

        # randrange only works with ints, so the learning rate is drawn from a float grid
        lr = np.random.choice(np.arange(learning_rate_range_start, learning_rate_range_end, learning_rate_step), size=1)[0]
        print("batch size : ", batch_size, "...  epochs :", epochs,"... learning rate:" , lr,"...  iteration:", iteration)

        # a brand-new model every iteration
        model = keras.Model(inputs, outputs)

        # Prep the model for -learning-
        # - the sigmoid output is a probability, so the loss must not treat it as a logit
        # - a recall threshold of 0 is exceeded by every sigmoid output, which pins
        #   recall at 1.0 and ends the search after one iteration; use 0.5
        model.compile(loss=keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=keras.optimizers.SGD(learning_rate=float(lr)),
        metrics=[keras.metrics.Recall(thresholds=0.5)])

        # train with one summary line per epoch (verbose = 2)
        history = model.fit(x_train, y_train,
            batch_size = batch_size,
            epochs = epochs,
            verbose = 2,
            validation_data = (x_val, y_val)
            )
        # score = [loss, recall] on the validation split
        score = model.evaluate(x_val, y_val, verbose = 0)
        print("acc_score: ", score[1], "loss_value: ", score[0])

        if (score[1] > max_score):
            max_score = score[1]
            min_loss = score[0]
            best_batch_size = batch_size
            best_num_epochs = epochs
            best_lr = lr
            # `model` and `history` are created anew on every iteration and never mutated
            # afterwards, so keeping the references is enough - no deep copy required
            best_model = model
            best_history = history

        # increment the iteration
        iteration += 1

        row = [score[1], score[0], batch_size, epochs, lr]
        data_list.append(row)

    # create the dataframe with the data_list
    df = pd.DataFrame(data_list, columns=['score', 'loss', 'batch_size', 'epochs', 'learning_rate'])

    return df, max_score, min_loss, best_batch_size, best_num_epochs, best_lr, best_model, best_history

In [None]:
#df, max_score, min_loss, best_batch_score, best_num_epochs, best_lr,best_model, best_history = create_and_test_model(data, 0.005, 0.011, 0.001, 0.90, 2, 20, 2, 50, 300, 25, 5, 0, 100, 0, 0, 0, None, None, normalize = False)