In [2]:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
# MLP = multi-layer perceptron.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
# Column we want to predict: did the employee leave the company?
# Its values are expected to be 0 or 1.
TARGET_FEATURE = 'left'

# Columns that hold category labels rather than numbers.
CATEGORICAL_FEATURES = ['sales', 'salary']

# Fraction of the rows held back for testing the model (20%).
TEST_SET_SIZE = 0.2

In [5]:
# Read the CSV into a pandas DataFrame. The relative path means the
# file is expected to sit in the same directory as this script.
raw_data = pd.read_csv('HR_comma_sep.csv')

# Eyeball five randomly chosen rows as a quick sanity check.
# The trailing '' argument produces the blank separator line.
print('Sample of loaded data:', raw_data.sample(5), '', sep='\n')

# Show how many rows fall under each value of the target column.
print(
    'Count per value (0 or 1) of the target feature:',
    raw_data[TARGET_FEATURE].value_counts(),
    '',
    sep='\n',
)

Sample of loaded data:
       satisfaction_level  last_evaluation  number_project  \
2991                 0.90             0.77               3   
4838                 0.46             0.38               6   
4068                 0.60             0.49               2   
14228                0.78             0.99               4   
9054                 0.64             0.77               3   

       average_montly_hours  time_spend_company  Work_accident  left  \
2991                    156                   3              0     0   
4838                    165                   3              0     0   
4068                    194                   4              0     0   
14228                   255                   6              0     1   
9054                    249                   2              1     0   

       promotion_last_5years       sales  salary  
2991                       0       sales  medium  
4838                       0  accounting     low  
4068              

In [7]:
# Step 4: Set up the data.
# ~~~~~~~~~~~~~~~~~~~~~~~~

# Seed for the train/test split, so that restarting the kernel and
# re-running the notebook reproduces the same partition (and therefore
# comparable metrics from run to run).
SPLIT_RANDOM_STATE = 42

# Separate the X and Y values.
y_data = raw_data[TARGET_FEATURE]

# drop() returns a new DataFrame, so raw_data itself is left untouched.
x_data = raw_data.drop(columns=TARGET_FEATURE)

# To include an intercept, add a new column with a constant.
# NOTE(review): MLPClassifier fits its own bias terms, so this column
# is likely redundant -- kept so the feature set stays unchanged.
x_data['intercept'] = 1.0

# Turn categorical variables into dummy columns (0 or 1 values).
# Do this to avoid assuming a meaningful order of categories.
# Use drop_first to avoid multicollinearity among features.
x_data = pd.get_dummies(
    x_data,
    columns=CATEGORICAL_FEATURES,
    drop_first=True
)

# It's helpful to double check that the final data looks good.
print('Sample of data to use:')
print(x_data.sample(5))
print('')

# Split the data into training and test sets.
# - random_state makes the split repeatable.
# - stratify keeps the proportion of each target value the same in
#   the training and test sets, so the test metrics are not skewed by
#   an unlucky draw of the less frequent class.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=TEST_SET_SIZE,
    random_state=SPLIT_RANDOM_STATE,
    stratify=y_data
)



Sample of data to use:
       satisfaction_level  last_evaluation  number_project  \
26                   0.82             0.87               4   
4053                 0.92             0.67               4   
4154                 0.98             0.66               3   
7972                 0.53             0.70               4   
12175                0.54             0.74               4   

       average_montly_hours  time_spend_company  Work_accident  \
26                      239                   5              0   
4053                    161                   4              1   
4154                    150                   3              0   
7972                    243                   3              0   
12175                   164                   2              0   

       promotion_last_5years  intercept  sales_RandD  sales_accounting  \
26                         0        1.0            0                 0   
4053                       0        1.0            0       

In [9]:

# Step 5: Fit the model.
# ~~~~~~~~~~~~~~~~~~~~~~

# Seed for the network's random weight initialisation, so repeated
# runs of the notebook train the same model.
MODEL_RANDOM_STATE = 42

# The features live on very different scales (monthly hours are in the
# hundreds while satisfaction and evaluation scores are below 1), and
# an MLP trains poorly on unscaled inputs. The pipeline standardises
# each column first; the scaler is fitted on the training data only
# and re-applied automatically whenever model.predict() is called.
model = make_pipeline(
    StandardScaler(),
    MLPClassifier(random_state=MODEL_RANDOM_STATE),
).fit(x_train, y_train)

# Yes, that's it!




In [None]:

# Get the predicted target (y) values.
y_predict = model.predict(x_test)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is 0."""
    return numerator / denominator if denominator else float('nan')


# Get the confusion matrix and calculate the results.
#   M[i][j] = #cases with known value i and predicted value j.
# Passing labels=[0, 1] pins the matrix to 2x2. Without it, the matrix
# shrinks to 1x1 when only one class shows up in y_test and y_predict,
# and indexing M[1][1] would raise an IndexError.
M = confusion_matrix(y_test, y_predict, labels=[0, 1])
(true_neg, false_pos), (false_neg, true_pos) = M
n_samples = len(y_test)

# Each ratio is guarded: e.g. precision is undefined (NaN) when the
# model never predicts the positive class.
print('Accuracy:  %.2f' % _safe_ratio(true_neg + true_pos, n_samples))
print('Precision: %.2f' % _safe_ratio(true_pos, false_pos + true_pos))
print('Recall:    %.2f' % _safe_ratio(true_pos, false_neg + true_pos))

Useful resources:
- https://pyimagesearch.com/2021/05/06/implementing-feedforward-neural-networks-with-keras-and-tensorflow/
- https://hackernoon.com/building-a-feedforward-neural-network-from-scratch-in-python-d3526457156b
- https://stackabuse.com/introduction-to-neural-networks-with-scikit-learn/