## Part 1: Preprocessing

In [77]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder ,OneHotEncoder

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [78]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [79]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition','Department']]

y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [80]:
# Create a list of at least 10 column names to use as X data
feature_columns = ['EnvironmentSatisfaction','JobInvolvement','JobSatisfaction','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction',
                     'StockOptionLevel','TrainingTimesLastYear','WorkLifeBalance','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

# Create X_df using your selected columns
X_df = attrition_df[feature_columns]

# Display the DataFrame
display(X_df.head())

# Show the data types for X_df
X_df.info()
# all features are integers

Unnamed: 0,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,2,3,4,11,3,1,0,0,1,4,0,5
1,3,2,2,23,4,4,1,3,3,7,1,7
2,4,2,3,15,3,2,0,3,3,0,0,0
3,4,3,3,11,3,3,0,3,3,7,3,0
4,1,3,2,12,3,4,1,3,3,2,2,2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   EnvironmentSatisfaction   1470 non-null   int64
 1   JobInvolvement            1470 non-null   int64
 2   JobSatisfaction           1470 non-null   int64
 3   PercentSalaryHike         1470 non-null   int64
 4   PerformanceRating         1470 non-null   int64
 5   RelationshipSatisfaction  1470 non-null   int64
 6   StockOptionLevel          1470 non-null   int64
 7   TrainingTimesLastYear     1470 non-null   int64
 8   WorkLifeBalance           1470 non-null   int64
 9   YearsInCurrentRole        1470 non-null   int64
 10  YearsSinceLastPromotion   1470 non-null   int64
 11  YearsWithCurrManager      1470 non-null   int64
dtypes: int64(12)
memory usage: 137.9 KB


In [81]:
# Split the data into training and testing sets
#from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=99)



In [82]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Happily all these data fields are already numeric, but scaling is called for

display(X_train.head())
display(X_test.head())

y_train.value_counts('Attrition')

Unnamed: 0,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
248,3,2,1,14,3,1,1,2,2,1,0,2
1073,3,1,2,14,3,2,2,2,2,6,1,7
787,4,3,2,18,3,2,1,4,3,2,1,2
305,2,3,2,15,3,3,1,3,3,8,0,8
1096,3,3,4,21,4,4,0,2,3,7,7,7


Unnamed: 0,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1328,2,2,3,21,4,4,1,5,3,8,5,8
725,3,2,2,24,4,4,1,2,1,2,0,2
74,2,4,4,12,3,2,0,3,3,0,0,0
1132,4,3,1,15,3,3,1,2,3,4,1,2
1042,3,2,3,14,3,4,0,5,3,2,0,3


Unnamed: 0_level_0,count
Attrition,Unnamed: 1_level_1
No,924
Yes,178


In [83]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

X_train_scaled

array([[ 0.2546282 , -1.00109601, -1.57985629, ..., -0.88786305,
        -0.68759869, -0.57997249],
       [ 0.2546282 , -2.40466828, -0.66450317, ...,  0.49292312,
        -0.38032628,  0.8265629 ],
       [ 1.1627197 ,  0.40247626, -0.66450317, ..., -0.61170581,
        -0.38032628, -0.57997249],
       ...,
       [ 0.2546282 , -2.40466828,  1.16620306, ..., -1.16402028,
        -0.68759869, -1.14258664],
       [ 1.1627197 ,  0.40247626,  0.25084995, ..., -1.16402028,
        -0.68759869, -1.14258664],
       [-0.6534633 ,  0.40247626, -0.66450317, ...,  1.04523759,
         0.84876334, -0.29866541]])

In [84]:
# Create a OneHotEncoder for the Department column
department_ohe = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
department_ohe.fit(y_train['Department'].values.reshape(-1,1))

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded_department = department_ohe.transform(y_train['Department'].values.reshape(-1,1))
y_test_encoded_department = department_ohe.transform(y_test['Department'].values.reshape(-1,1))

y_test_encoded_department[:5]


array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

## Create, Compile, and Train the Model

In [85]:
# Create a OneHotEncoder for the Attrition column
attrition_ohe = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
attrition_ohe.fit(y_train['Attrition'].values.reshape(-1,1))

# Create two new variables by applying the encoder
# to the training and testing data
y_train_encoded_attrition = attrition_ohe.transform(y_train['Attrition'].values.reshape(-1,1))
y_test_encoded_attrition = attrition_ohe.transform(y_test['Attrition'].values.reshape(-1,1))

y_test_encoded_attrition[:5]


array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [1., 0.]])

In [86]:
# Find the number of columns in the X training data
display(X_train_scaled.shape)
display(X_test_scaled.shape)
# 12 columns

input_layer = layers.Input(shape=(X_train_scaled.shape[1],), name='input_features')

# Shared hidden layers
shared_layer1 = layers.Dense(64, activation='relu')(input_layer)
shared_layer2 = layers.Dense(32, activation='relu')(shared_layer1)


(1102, 12)

(368, 12)

In [87]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_layer = layers.Dense(32, activation='relu')(shared_layer2)

# Create the output layer
department_output = layers.Dense(3, activation='softmax', name='department_output')(department_layer)



In [88]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_layer = layers.Dense(32, activation='relu')(shared_layer2)

# Create the output layer, changed number of units to 2 to match y_train_encoded_attrition
attrition_output = layers.Dense(2, activation='sigmoid', name='attrition_output')(attrition_layer)


In [89]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output,attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'categorical_crossentropy'}, # Changed to categorical_crossentropy
              metrics={'department_output': 'accuracy',
                       'attrition_output': tf.keras.metrics.AUC(name='auc')}) # Changed 'AUC-ROC' to tf.keras.metrics.AUC

# Summarize the model
model.summary()

Model: "model_12"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_features (InputLayer  [(None, 12)]                 0         []                            
 )                                                                                                
                                                                                                  
 dense_14 (Dense)            (None, 64)                   832       ['input_features[0][0]']      
                                                                                                  
 dense_15 (Dense)            (None, 32)                   2080      ['dense_14[0][0]']            
                                                                                                  
 dense_16 (Dense)            (None, 32)                   1056      ['dense_15[0][0]']     

In [90]:
# Train the model

print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of y_train_encoded_department:", y_train_encoded_department.shape)
print("Shape of y_train_encoded_attrition:", y_train_encoded_attrition.shape)

model.fit(
    X_train_scaled,
    {'department_output': y_train_encoded_department,
     'attrition_output': y_train_encoded_attrition},
    epochs=20,
    batch_size=32,
    validation_split=0.2
)


Shape of X_train_scaled: (1102, 12)
Shape of y_train_encoded_department: (1102, 3)
Shape of y_train_encoded_attrition: (1102, 2)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7c61b2d63ee0>

In [91]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test_scaled, {'department_output': y_test_encoded_department, 'attrition_output': y_test_encoded_attrition})
test_results



[1.2840155363082886,
 0.8379517197608948,
 0.44606369733810425,
 0.60326087474823,
 0.8773592710494995]

In [92]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {test_results[3]}")
print(f"Attrition Accuracy: {test_results[4]}")

Department Accuracy: 0.60326087474823
Attrition Accuracy: 0.8773592710494995


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?


YOUR ANSWERS HERE

1. There is a significant difference in the number of people in each department.  That makes _accuracy_ a poor metric because the model will not learn to identify instances of the less frequent value, in this case "HR".  I tried using the 'area under the ROC curve' instead of accuracy, and I did get a significant improvement (.66 for accuracy vs. .88 for AUC-ROC)
2. The department output had several possible outcomes, so softmax is appropriate.  But the attrition output is binary - either they left or they stayed - so a better activation function is sigmoid or tanh.
3. Additional tuning would help as would a more complete exploration of hypertuning.  
More epochs would have helped - the loss function was still dropping after the 10th (and 20th) epoch, suggesting that more epochs could have improved results.
Finally, choosing a different feature set might have been more effective at predicting the correct results.  