## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [4]:
# Create a list of at least 10 column names to use as X data
columns = ["Education", "Age", "DistanceFromHome", "JobSatisfaction", "OverTime","StockOptionLevel", "WorkLifeBalance", "YearsAtCompany", "YearsSinceLastPromotion", "NumCompaniesWorked"  ]

# Create X_df using your selected columns
X_df = attrition_df[[column for column in columns]]

# Show the data types for X_df
X_df.dtypes


Unnamed: 0,0
Education,int64
Age,int64
DistanceFromHome,int64
JobSatisfaction,int64
OverTime,object
StockOptionLevel,int64
WorkLifeBalance,int64
YearsAtCompany,int64
YearsSinceLastPromotion,int64
NumCompaniesWorked,int64


In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=1)

In [6]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
print(X_train['OverTime'].value_counts())
print(X_test['OverTime'].value_counts())

OverTime
No     798
Yes    304
Name: count, dtype: int64
OverTime
No     256
Yes    112
Name: count, dtype: int64


In [7]:
X_test.head()

Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,OverTime,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked
1291,4,37,10,1,No,0,1,10,0,1
1153,2,18,3,4,Yes,0,4,0,0,1
720,3,30,22,3,Yes,0,3,5,0,4
763,4,34,10,3,Yes,1,3,1,0,1
976,3,56,23,2,Yes,1,3,19,15,4


In [8]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
le = LabelEncoder()

# Apply label encoding to the 'OverTime' column
X_train['OverTime'] = le.fit_transform(X_train['OverTime'])
X_test['OverTime'] = le.transform(X_test['OverTime'])

# Check the result
print(X_train['OverTime'].value_counts())

OverTime
0    798
1    304
Name: count, dtype: int64


In [9]:
# Check data types of all columns
print(X_train.dtypes)

# Convert any object columns to numeric if they contain numeric data
for column in X_df.columns:
    if X_df[column].dtype == 'object':
        try:
            X_df[column] = pd.to_numeric(X_df[column])
        except ValueError:
            print(f"Could not convert {column} to numeric")

# Check data types again
print(X_train.dtypes)

Education                  int64
Age                        int64
DistanceFromHome           int64
JobSatisfaction            int64
OverTime                   int64
StockOptionLevel           int64
WorkLifeBalance            int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
NumCompaniesWorked         int64
dtype: object
Could not convert OverTime to numeric
Education                  int64
Age                        int64
DistanceFromHome           int64
JobSatisfaction            int64
OverTime                   int64
StockOptionLevel           int64
WorkLifeBalance            int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
NumCompaniesWorked         int64
dtype: object


In [10]:
# Create a StandardScaler
ss = StandardScaler()

# Fit the StandardScaler to the training data
ss.fit(X_train)

# Scale the training and testing data
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)


In [11]:
# Create a OneHotEncoder for the Department column
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
ohe.fit(y_train[['Department']])

# Create two new variables by applying the encoder
# to the training and testing data
department_train_encoded = ohe.transform(y_train[['Department']])
department_test_encoded = ohe.transform(y_test[['Department']])

department_train_encoded

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [12]:
# Create a OneHotEncoder for the Attrition column
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder to the training data
ohe.fit(y_train[['Attrition']])

# Create two new variables by applying the encoder
# to the training and testing data
attrition_train_encoded = ohe.transform(y_train[['Attrition']])
attrition_test_encoded = ohe.transform(y_test[['Attrition']])

attrition_train_encoded

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]])

## Create, Compile, and Train the Model

In [13]:
# Find the number of columns in the X training data
cols = X_train_scaled.shape[1]

# Create the input layer
input_layer = layers.Input(shape=(cols,))

# Create at least two shared layers
shared1 = layers.Dense(units=64, activation='relu')(input_layer)
shared2 = layers.Dense(units=128, activation='relu')(shared1)

In [14]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_hidden = layers.Dense(units=32, activation='relu')(shared2)

# Create the output layer
department_output = layers.Dense(units=3, activation='softmax', name='department_output')(department_hidden)


In [15]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = layers.Dense(units=32, activation='relu')(shared2)

# Create the output layer
attrition_output = layers.Dense(units=2, activation='softmax', name='attrition_output')(attrition_hidden)


In [16]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
# Compile the model
# Compile the model with separate losses and metrics for each output
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy',
                    'attrition_output': 'categorical_crossentropy'},
              metrics={'department_output': 'accuracy',
                       'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

In [17]:
# Train the model
model.fit(X_train_scaled, [department_train_encoded, attrition_train_encoded], epochs=100, batch_size=32)


Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - attrition_output_accuracy: 0.6950 - department_output_accuracy: 0.3907 - loss: 1.6296
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - attrition_output_accuracy: 0.8386 - department_output_accuracy: 0.6551 - loss: 1.2120
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - attrition_output_accuracy: 0.8557 - department_output_accuracy: 0.6542 - loss: 1.1653
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8578 - department_output_accuracy: 0.6460 - loss: 1.1319
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8669 - department_output_accuracy: 0.6404 - loss: 1.1299
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8539 - department_o

<keras.src.callbacks.history.History at 0x7d0bf05e9c30>

In [18]:
# Evaluate the model with the testing data
model_loss, department_accuracy, attrition_accuracy = model.evaluate(X_test_scaled, [department_test_encoded, attrition_test_encoded])

# Print the results
print(f"Department Accuracy: {department_accuracy}")

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.7349 - department_output_accuracy: 0.5191 - loss: 5.2014  
Department Accuracy: 0.76902174949646


In [19]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {department_accuracy}")
print(f"Attrition Accuracy: {attrition_accuracy}")

Department Accuracy: 0.76902174949646
Attrition Accuracy: 0.508152186870575


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy may not always be the best metric, especially if the dataset is imbalanced (i.e., one class is much more frequent than others). For example, if the majority of employees in an attrition dataset are "not leaving" (class imbalance), accuracy might be misleading. In such cases, metrics like precision, recall, and F1-score are more informative, as they focus on the performance related to specific classes.
2. I used softmax for both output layers because:

  - The department output is a multiclass classification problem (e.g., predicting which department an employee belongs to), where softmax is useful for providing a probability distribution over multiple classes.
  - The attrition output is a binary classification problem (e.g., predicting whether an employee will leave or stay), where softmax works well to predict the probability for the two classes.
3.
  - Feature engineering: Adding new, meaningful features (e.g., interaction terms, ratios) could improve model performance.
  - Regularization: Techniques like dropout or L2 regularization can be applied to prevent overfitting.
  - Hyperparameter tuning: Experimenting with different hyperparameters (e.g., learning rate, batch size) could improve the model’s performance.
  - Model complexity: Adjusting the architecture, such as increasing/decreasing the number of layers or neurons in hidden layers, could provide better results.