# Module 19 Challenge - Human Resources Employee Retention Tools

##The objective of this script is to create a neural network that HR can use to predict whether employees are likely to leave the company, and identify if an employee is better suited to another department.

## Part 1a.  Pre-Processing - Clean Data for Separation of Training and Test Set

In [20]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, concatenate

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [5]:
attrition_df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EnvironmentSatisfaction      int64
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
NumCompaniesWorked           int64
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [6]:
# Preprocess y
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Preprocess columns with dtype = object

# Preprocess BINARY column using label encoding
attrition_encoder = LabelEncoder()
attrition_df['Attrition'] = attrition_encoder.fit_transform(attrition_df['Attrition'])

# Preprocess, one-at-a-time, CATEGORY columns (multiple categores) using one-hot encoding
OverTime_encoder = OneHotEncoder(sparse_output=False)
OverTime_encoded = OverTime_encoder.fit_transform(attrition_df[['OverTime']])
OverTime_columns = OverTime_encoder.get_feature_names_out(['OverTime'])
df_OverTime_encoded = pd.DataFrame(OverTime_encoded, columns=OverTime_columns)

BusinessTravel_encoder = OneHotEncoder(sparse_output=False)
BusinessTravel_encoded = BusinessTravel_encoder.fit_transform(attrition_df[['BusinessTravel']])
BusinessTravel_columns = BusinessTravel_encoder.get_feature_names_out(['BusinessTravel'])
df_BusinessTravel_encoded = pd.DataFrame(BusinessTravel_encoded, columns=BusinessTravel_columns)

Department_encoder = OneHotEncoder(sparse_output=False)
Department_encoded = Department_encoder.fit_transform(attrition_df[['Department']])
Department_columns = Department_encoder.get_feature_names_out(['Department'])
df_Department_encoded = pd.DataFrame(Department_encoded, columns=Department_columns)

EducationField_encoder = OneHotEncoder(sparse_output=False)
EducationField_encoded = EducationField_encoder.fit_transform(attrition_df[['EducationField']])
EducationField_columns = EducationField_encoder.get_feature_names_out(['EducationField'])
df_EducationField_encoded = pd.DataFrame(EducationField_encoded, columns=EducationField_columns)

JobRole_encoder = OneHotEncoder(sparse_output=False)
JobRole_encoded = JobRole_encoder.fit_transform(attrition_df[['JobRole']])
JobRole_columns = JobRole_encoder.get_feature_names_out(['JobRole'])
df_JobRole_encoded = pd.DataFrame(JobRole_encoded, columns=JobRole_columns)

MaritalStatus_encoder = OneHotEncoder(sparse_output=False)
MaritalStatus_encoded = MaritalStatus_encoder.fit_transform(attrition_df[['MaritalStatus']])
MaritalStatus_columns = MaritalStatus_encoder.get_feature_names_out(['MaritalStatus'])
df_MaritalStatus_encoded = pd.DataFrame(MaritalStatus_encoded, columns=MaritalStatus_columns)

# Concatenate the encoded columns to the original DataFrame
df_processed = pd.concat([attrition_df, df_OverTime_encoded, df_BusinessTravel_encoded, df_Department_encoded, df_EducationField_encoded, df_JobRole_encoded, df_MaritalStatus_encoded], axis=1)

# Drop the original "quality" and "color" columns
df_processed = df_processed.drop(['OverTime', 'BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus'], axis=1)

df_processed.head()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,1,1,2,2,94,3,2,4,8,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,0,8,1,3,61,2,2,2,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,37,1,2,2,4,92,2,1,3,6,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,33,0,3,4,4,56,3,1,3,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,27,0,2,1,1,40,3,1,2,9,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
df_processed.dtypes

Age                                    int64
Attrition                              int64
DistanceFromHome                       int64
Education                              int64
EnvironmentSatisfaction                int64
HourlyRate                             int64
JobInvolvement                         int64
JobLevel                               int64
JobSatisfaction                        int64
NumCompaniesWorked                     int64
PercentSalaryHike                      int64
PerformanceRating                      int64
RelationshipSatisfaction               int64
StockOptionLevel                       int64
TotalWorkingYears                      int64
TrainingTimesLastYear                  int64
WorkLifeBalance                        int64
YearsAtCompany                         int64
YearsInCurrentRole                     int64
YearsSinceLastPromotion                int64
YearsWithCurrManager                   int64
OverTime_No                          float64
OverTime_Y

In [None]:
# Preprocess, one-at-a-time, CATEGORY columns (multiple categores) using one-hot encoding
# overtime_encoder = OneHotEncoder(sparse_output=False)
# overtime_encoded = overtime_encoder.fit_transform(attrition_df[['OverTime']])
# overtime_columns = overtime_encoder.get_feature_names_out(['OverTime'])
# df_overtime_encoded = pd.DataFrame(overtime_encoded, columns=overtime_columns)

# # Automate the process of one-hot encoding multiple columns
# # Start with making a list of columns to be one-hot encoded
# columns_list = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus', 'OverTime']

# # Convert the list into a DataFrame
# selected_columns_df = attrition_df[columns_list]

# # Create a loop to iterate through the list, and apply one-hot encoding as we go
# for i in columns_list:
#     category_encoder = OneHotEncoder(sparse_output=False)
#     category_encoded = category_encoder.fit_transform(i)
#     category_columns = category_encoder.get_feature_names_out(i)
#     df_auto_category_encoded = pd.DataFrame(category_encoded, columns=selected_columns_df)

## Part 1b.  Pre-Processing - Separation of Training and Test Set

Extra: Rank the features by the number of unique values, in descending order.  The resulting DataFrame will be used to create a list of 10 features to model.

In [8]:
# create DataFrame of feature names and their number of unique values
ranked_features_df = pd.DataFrame(attrition_df.nunique())

# reset index
ranked_features_df = ranked_features_df.reset_index()

# go for clean column names
ranked_features_df.rename(columns={'index': 'feature', 0: 'n_unique'}, inplace=True)

# sort rows by number of unique values
ranked_features_df.sort_values('n_unique', ascending=False, inplace=True)

# reset index
ranked_features_df = ranked_features_df.reset_index()

# drop unnecessary columns
ranked_features_df.drop('index', axis=1, inplace=True)

# # display df
ranked_features_df

Unnamed: 0,feature,n_unique
0,HourlyRate,71
1,Age,43
2,TotalWorkingYears,40
3,YearsAtCompany,37
4,DistanceFromHome,29
5,YearsInCurrentRole,19
6,YearsWithCurrManager,18
7,YearsSinceLastPromotion,16
8,PercentSalaryHike,15
9,NumCompaniesWorked,10


In [9]:
# Create a list of at least 10 column names to use as X data
selected_columns = ranked_features_df.head()['feature'].tolist() # use head() to get top 5
selected_columns.extend(ranked_features_df.tail()['feature'].tolist()) # use tail() to get bottom 5 and .append() to add to list
# selected_columns

# Create X_df using your selected columns
selected_columns_df = pd.DataFrame(selected_columns, columns=['feature'])
selected_columns_df

Unnamed: 0,feature
0,HourlyRate
1,Age
2,TotalWorkingYears
3,YearsAtCompany
4,DistanceFromHome
5,BusinessTravel
6,MaritalStatus
7,OverTime
8,Attrition
9,PerformanceRating


In [10]:
# df_processed.head()
# df_processed.info()
# df_processed.describe()
print(df_processed.columns)

Index(['Age', 'Attrition', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'OverTime_No', 'OverTime_Yes',
       'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'JobRole_Healthcare Representative', 'JobRole_Human Resources',
       'JobRole_Laboratory Techni

In [11]:
# Split data into X and two separate y variables
X = df_processed.drop(columns=['Attrition', 'Department_Human Resources', 'Department_Research & Development', 'Department_Sales'])
y_attrition = df_processed[['Attrition']]
y_department = df_processed[['Attrition', 'Department_Human Resources', 'Department_Research & Development', 'Department_Sales']]

# View X_df
X.head()

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,PercentSalaryHike,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,1,2,2,94,3,2,4,8,11,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49,8,1,3,61,2,2,2,1,23,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,37,2,2,4,92,2,1,3,6,15,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,33,3,4,4,56,3,1,3,1,11,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,27,2,1,1,40,3,1,2,9,12,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [32]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the features and targets into training and testing sets
X_train_attrition, X_test_attrition, y_attrition_train, y_attrition_test = train_test_split(X, y_attrition, test_size=0.2, random_state=42)
X_train_department, X_test_department, y_department_train, y_department_test = train_test_split(X, y_department, test_size=0.2, random_state=42)

In [13]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Already done above

In [33]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the training data (only features)
scaler.fit(X_train_attrition) # Fits scaler to the data. No need to fit again for X_train_for_department if it's the same data

# Scale the training and testing feature data for attrition
X_train_attrition_scaled = scaler.transform(X_train_attrition)
X_test_attrition_scaled = scaler.transform(X_test_attrition)

# Scale the training and testing feature data for department
# Note: This step is technically redundant if X_train_for_attrition == X_train_for_department
# and X_test_for_attrition == X_test_for_department, as the data is the same.
X_train_department_scaled = scaler.transform(X_train_department)
X_test_department_scaled = scaler.transform(X_test_department)

In [None]:
# Create a OneHotEncoder for the Department column


# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data




In [None]:
# Create a OneHotEncoder for the Attrition column


# Fit the encoder to the training data


# Create two new variables by applying the encoder
# to the training and testing data



## Create, Compile, and Train the Model

In [24]:
from tensorflow.keras import layers

In [34]:
# Find the number of columns in the X training data
num_columns = X_train_attrition_scaled.shape[1]
print("Number of columns:", num_columns)

# Create the input layer based on the number of features (columns)
input_layer = layers.Input(shape=(num_columns,), name='input_features')

# Create at least two shared layers
shared_layer1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer2 = layers.Dense(32, activation='relu')(shared_layer1)

Number of columns: 43


In [26]:
# Create the Department branch of the model. '3' is the correct number of unique departments after one-hot encoding.
department_input = Input(shape=(3,), name='department_input')

# Hidden layer for the Department branch
department_hidden = layers.Dense(32, activation='relu', name='department_hidden')(department_input)

# Output layer for the Department branch
department_output = layers.Dense(1, activation='sigmoid', name='department_output')(department_hidden)

In [35]:
# Create the Attrition branch of the model. '3' is the correct number of unique departments after one-hot encoding.
attrition_input = Input(shape=(2,), name='attrition_input')

# Hidden layer for the Attrition branch
attrition_hidden = layers.Dense(16, activation='relu', name='attrition_hidden')(attrition_input)

# Output layer for the Attrition branch
attrition_output = layers.Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden)

In [36]:
# Create the model
# Ensure that 'input_layer' is meant to be replaced or used alongside 'department_input'
model = Model(inputs=department_input, outputs=department_output)

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Summarize the model
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 department_input (InputLay  [(None, 3)]               0         
 er)                                                             
                                                                 
 department_hidden (Dense)   (None, 32)                128       
                                                                 
 department_output (Dense)   (None, 1)                 33        
                                                                 
Total params: 161 (644.00 Byte)
Trainable params: 161 (644.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [37]:
# Train the model
history = model.fit(X_train_department_scaled, y_department_train,
                    validation_split=0.2,  # Optionally, set aside a portion of the training data for validation
                    epochs=10,             # The number of epochs (iterations over the entire dataset) to train for
                    batch_size=32,         # The number of samples per gradient update for training
                    verbose=1)             # Verbosity mode (0 = silent, 1 = progress bar, 2 = one line per epoch)


Epoch 1/10


ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1150, in train_step
        y_pred = self(x, training=True)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_2" is incompatible with the layer: expected shape=(None, 3), found shape=(None, 43)


In [None]:
# Evaluate the model with the testing data


In [None]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1.
2.
3.