<a href="https://colab.research.google.com/github/maddiejane25/neural-network-challenge-2/blob/main/attrition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part 1: Preprocessing

In [25]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [26]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [27]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [28]:
# Create a list of at least 10 column names to use as X data
X_data = ['Age', 'DistanceFromHome', 'Education', 'JobSatisfaction', 'OverTime', 'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'YearsSinceLastPromotion', 'NumCompaniesWorked']

# Create X_df using your selected columns
X_df = attrition_df[X_data]

# Show the data types for X_df
X_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   DistanceFromHome         1470 non-null   int64 
 2   Education                1470 non-null   int64 
 3   JobSatisfaction          1470 non-null   int64 
 4   OverTime                 1470 non-null   object
 5   StockOptionLevel         1470 non-null   int64 
 6   WorkLifeBalance          1470 non-null   int64 
 7   YearsAtCompany           1470 non-null   int64 
 8   YearsSinceLastPromotion  1470 non-null   int64 
 9   NumCompaniesWorked       1470 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 115.0+ KB


In [29]:
X_df.head()

Unnamed: 0,Age,DistanceFromHome,Education,JobSatisfaction,OverTime,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked
0,41,1,2,4,Yes,0,1,6,0,8
1,49,8,1,2,No,1,3,10,1,1
2,37,2,2,3,Yes,0,3,0,0,6
3,33,3,4,3,Yes,0,3,8,3,1
4,27,2,1,2,No,1,3,2,2,9


In [31]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=1)


In [32]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# Convert the OverTime column into int64 where 1 is yes
X_train['OverTime'] = X_train['OverTime'].replace({'Yes': 1, 'No': 0}).astype('int64')
X_test['OverTime'] = X_test['OverTime'].replace({'Yes': 1, 'No': 0}).astype('int64')
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1102 entries, 464 to 1061
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   Age                      1102 non-null   int64
 1   DistanceFromHome         1102 non-null   int64
 2   Education                1102 non-null   int64
 3   JobSatisfaction          1102 non-null   int64
 4   OverTime                 1102 non-null   int64
 5   StockOptionLevel         1102 non-null   int64
 6   WorkLifeBalance          1102 non-null   int64
 7   YearsAtCompany           1102 non-null   int64
 8   YearsSinceLastPromotion  1102 non-null   int64
 9   NumCompaniesWorked       1102 non-null   int64
dtypes: int64(10)
memory usage: 94.7 KB


  X_train['OverTime'] = X_train['OverTime'].replace({'Yes': 1, 'No': 0}).astype('int64')
  X_test['OverTime'] = X_test['OverTime'].replace({'Yes': 1, 'No': 0}).astype('int64')


In [33]:
# Create a StandardScaler
scaler = StandardScaler()


# Fit the StandardScaler to the training data
scaler.fit(X_train)


# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [34]:
# Create a OneHotEncoder for the Department column in the y_train dataset
# Fit the encoder to the training data
# Create two new variables by applying the encoder
# to the training and testing data

from sklearn.preprocessing import OneHotEncoder
OneHot_depart = OneHotEncoder(sparse_output=False)
department_train_enc = OneHot_depart.fit_transform(y_train[['Department']])
department_columns = OneHot_depart.get_feature_names_out(['Department'])
df_department_train_enc = pd.DataFrame(department_train_enc, columns=department_columns,)
department_test_enc = OneHot_depart.transform(y_test[['Department']])
department_columns = OneHot_depart.get_feature_names_out(['Department'])
df_department_test_enc = pd.DataFrame(department_test_enc, columns=department_columns)

print(df_department_train_enc.shape)
print(df_department_test_enc.shape)


(1102, 3)
(368, 3)


In [35]:
# Create a OneHotEncoder for the Attrition column
# Fit the encoder to the training data
# Create two new variables by applying the encoder
# to the training and testing data
OneHot_attrition = OneHotEncoder(sparse_output=False)
att_train_enc = OneHot_attrition.fit_transform(y_train[['Attrition']])
att_test_enc = OneHot_attrition.transform(y_test[['Attrition']])
print(att_train_enc.shape)
print(att_test_enc.shape)


(1102, 2)
(368, 2)


## Create, Compile, and Train the Model

In [36]:
# Find the number of columns in the X training data
# Create the input layer
input_layer = layers.Input(shape=(X_train_scaled.shape[1],))

# Create at least two shared layers
shared_layer1 = layers.Dense(units=64, activation='relu')(input_layer)
shared_layer2 = layers.Dense(units=128, activation='relu')(shared_layer1)

In [37]:
# Create a branch for Department
# with a hidden layer and an output layer
department_branch = layers.Dense(units=32, activation='relu', name='department_branch')(shared_layer2)
department_output = layers.Dense(units=3, activation='softmax', name='department_output')(department_branch)


In [38]:
# Create a branch for Attrition
# with a hidden layer and an output layer
attrition_branch = layers.Dense(units=32, activation='relu', name = 'attrition_branch')(shared_layer2)
attrition_output = layers.Dense(units=2, activation='sigmoid', name='attrition_output')(attrition_branch)

In [42]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output':'categorical_crossentropy', 'attrition_output':'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()


In [43]:
# Train the model
model.fit(X_train_scaled,
    {'department_output': df_department_train_enc, 'attrition_output': att_train_enc},
    epochs=10,
    batch_size=32,
    validation_data=(X_test_scaled, {'department_output': df_department_test_enc, 'attrition_output': att_test_enc}))

Epoch 1/10


ValueError: Arguments `target` and `output` must have the same shape. Received: target.shape=(None, 2), output.shape=(None, 3)

In [None]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test_scaled, {'department_output': department_test_enc, 'attrition_output': att_test_enc})
test_results


In [None]:
# Print the accuracy for both department and attrition
print(f"Department Accuracy: {test_results[2]}")
print(f"Attrition Accuracy: {test_results[3]}")



# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?


> Accuracy may not be the best way to evaluate this model for predicting attrition because of the imbalanced values in the dataset. You could possibly use a precision, recall or F1 score to better evaluate the model.


---



2. What activation functions did you choose for your output layers, and why?


> I chose softmax for the Department class because it's a multi-class prediction. I chose sigmoid for the Attrition because it's a binary class.


---



3. Can you name a few ways that this model might be improved?

> Increasing the number of features or changing the features used in the X dataset may change the predictive value.