## Part 1: Preprocessing

In [None]:
# Import our dependencies
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed

# Load the employee attrition dataset from the hosted course CSV into a DataFrame
attrition_df = pd.read_csv(
    filepath_or_buffer='https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv',
)
# Leave the frame as the last expression so the notebook renders a preview of it
attrition_df

In [None]:
# Count the distinct values in every column (axis=0 walks down each column)
attrition_df.nunique(axis=0)

In [None]:
# Build one single-column target frame per prediction task:
# Department (multi-class) and Attrition (Yes/No)
y_df_dept = attrition_df.loc[:, ['Department']]
y_df_attr = attrition_df.loc[:, ['Attrition']]

In [None]:
# Column names selected as the model's input features (X data)
x_columns = [
    'Age',
    'Education',
    'EducationField',
    'EnvironmentSatisfaction',
    'HourlyRate',
    'JobInvolvement',
    'JobSatisfaction',
    'MaritalStatus',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'PerformanceRating',
    'StockOptionLevel',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

# Create X_df from the selected columns
X_df = attrition_df[x_columns]

# Show the data types for X_df
X_df.info()

# Give X_df a fresh 0..n-1 index
X_df = X_df.reset_index(drop=True)

In [None]:
# Split the data into training and testing sets.
# A fixed random_state makes the split, and everything derived from it,
# reproducible across kernel restarts.
split_parts = train_test_split(X_df, y_df_dept, y_df_attr, random_state=42)

# train_test_split returns the train/test pair for each input, in input order.
# Every piece gets a fresh 0..n-1 index so the X and y rows stay aligned by
# position when encoded columns are built from them later.
X_train, X_test, y_train_dept, y_test_dept, y_train_attr, y_test_attr = [
    part.reset_index(drop=True) for part in split_parts
]

In [None]:
# Number of training rows per Department label (shows the class balance)
department_counts = y_train_dept.groupby(by='Department', sort=True).size()
department_counts

In [None]:
# Number of training rows per Attrition label (shows the class balance)
attr_counts = y_train_attr.groupby(by='Attrition', sort=True).size()
attr_counts

In [None]:
# Convert the categorical X columns to numeric indicator columns.
#
# Both columns are nominal, so both are one-hot encoded: integer label codes
# would impose an artificial ordering on any column with more than two
# categories, and the model would treat that ordering as meaningful.
categorical_cols = ['EducationField', 'MaritalStatus']

# Fit on the training split only so nothing from the test split leaks in.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising when the test split is transformed.
x_cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
x_cat_encoder.fit(X_train[categorical_cols])
x_cat_columns = x_cat_encoder.get_feature_names_out(categorical_cols)


def encode_categoricals(features: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `features` with the categorical columns one-hot encoded.

    The original categorical columns are dropped and the indicator columns are
    appended. The input frame is not modified, so re-running the cell gives
    the same result.
    """
    indicators = pd.DataFrame(
        x_cat_encoder.transform(features[categorical_cols]),
        columns=x_cat_columns,
        # Reuse the input's index so the concat below aligns row-for-row
        # regardless of how the input frame is indexed.
        index=features.index,
    )
    return pd.concat([features.drop(columns=categorical_cols), indicators], axis=1)


X_train_processed = encode_categoricals(X_train)
X_test_processed = encode_categoricals(X_test)

# Preview only the first few rows
X_train_processed.head()

In [None]:
# Build the scaler and fit it to the training features only
scaler = StandardScaler()
scaler.fit(X_train_processed)

# Apply the training-fitted scaling to both splits
X_train_scaled = scaler.transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

In [None]:
# One-hot encode the Department target.
# The encoder is fitted on the training labels only and reused for the test
# labels, so both splits share the same indicator columns in the same order.
dept_encoder = OneHotEncoder(sparse_output=False)
dept_encoder.fit(y_train_dept[['Department']])
dept_columns = dept_encoder.get_feature_names_out(['Department'])

# Each processed frame holds only the indicator columns on a fresh 0..n-1
# index; the upstream y_train_dept / y_test_dept frames are left untouched.
y_train_dept_processed = pd.DataFrame(
    dept_encoder.transform(y_train_dept[['Department']]),
    columns=dept_columns,
)
y_test_dept_processed = pd.DataFrame(
    dept_encoder.transform(y_test_dept[['Department']]),
    columns=dept_columns,
)
y_train_dept_processed.head()

In [None]:
# One-hot encode the Attrition target.
# The encoder is fitted on the training labels only and reused for the test
# labels, so both splits share the same indicator columns in the same order.
attr_encoder = OneHotEncoder(sparse_output=False)
attr_encoder.fit(y_train_attr[['Attrition']])
attr_columns = attr_encoder.get_feature_names_out(['Attrition'])

# Each processed frame holds only the indicator columns on a fresh 0..n-1
# index; the upstream y_train_attr / y_test_attr frames are left untouched.
y_train_attr_processed_fin = pd.DataFrame(
    attr_encoder.transform(y_train_attr[['Attrition']]),
    columns=attr_columns,
)
y_test_attr_processed_fin = pd.DataFrame(
    attr_encoder.transform(y_test_attr[['Attrition']]),
    columns=attr_columns,
)
y_train_attr_processed_fin.head()

## Create, Compile, and Train the Model

In [None]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation and training shuffles repeat between runs.
set_random_seed(42)

# Find the number of columns in the X training data
num_col_x = X_train_scaled.shape[1]
print(num_col_x)

# Create the input layer: one input per feature column
input_layer = layers.Input(shape=(num_col_x,), name='input_layer')

# Create two shared layers whose output feeds both task-specific branches
shared_layer1 = layers.Dense(64, activation='relu', name='shared1')(input_layer)
shared_layer2 = layers.Dense(32, activation='relu', name='shared2')(shared_layer1)

In [None]:
# Create a branch for Department with a hidden layer and an output layer
department_dense = layers.Dense(
    64, activation='relu', name='department_hidden'
)(shared_layer2)

# One softmax unit per one-hot Department column. The width is read from the
# encoded target so the layer always matches the labels it is trained on.
department_output = layers.Dense(
    y_train_dept_processed.shape[1],
    activation='softmax',
    name='department_output',
)(department_dense)

In [None]:
# Create a branch for Attrition with a hidden layer and an output layer
attrition_dense = layers.Dense(
    64, activation='relu', name='attrition_hidden'
)(shared_layer2)

# One sigmoid unit per one-hot Attrition column. The width is read from the
# encoded target so the layer always matches the labels it is trained on.
# NOTE(review): sigmoid units are scored independently of each other; for a
# mutually exclusive Yes/No label, a single sigmoid unit on a 0/1 target or a
# softmax over the one-hot columns would model the classes directly. Confirm
# which is intended before changing it.
attrition_output = layers.Dense(
    y_train_attr_processed_fin.shape[1],
    activation='sigmoid',
    name='attrition_output',
)(attrition_dense)

In [None]:
# Assemble the two-headed model: one shared input, two named outputs
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Per-output loss and metric, keyed by each output layer's name
output_losses = {
    'department_output': 'categorical_crossentropy',
    'attrition_output': 'binary_crossentropy',
}
output_metrics = {
    'department_output': 'accuracy',
    'attrition_output': 'accuracy',
}

# Compile the model
model.compile(optimizer='adam', loss=output_losses, metrics=output_metrics)

# Summarize the model
model.summary()

In [None]:
# Training targets, keyed by the name of the output layer each one feeds
train_targets = {
    'department_output': y_train_dept_processed,
    'attrition_output': y_train_attr_processed_fin,
}

# Train the model; 20% of the training rows are held out for validation
model.fit(
    X_train_scaled,
    train_targets,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)


In [None]:
# Test targets, keyed by the name of the output layer each one is scored against
test_targets = {
    'department_output': y_test_dept_processed,
    'attrition_output': y_test_attr_processed_fin,
}

# Evaluate the model with the testing data
test_results = model.evaluate(X_test_scaled, test_targets)
test_results

In [None]:
# Print the accuracy for both department and attrition.
# The metrics are looked up by name rather than by list position: the order
# and length of the list returned by evaluate() depend on the Keras version
# (per-output losses may come before the accuracies), so a fixed index can
# silently print a loss instead of an accuracy.
test_metrics = model.evaluate(
    X_test_scaled,
    {'department_output': y_test_dept_processed,
     'attrition_output': y_test_attr_processed_fin},
    return_dict=True,
    verbose=0,
)
print(f"Department Accuracy: {test_metrics['department_output_accuracy']}")
print(f"Attrition Accuracy: {test_metrics['attrition_output_accuracy']}")

# Values recorded from an earlier run:
#Attrition Accuracy: 0.883152186870575
#Department Accuracy: 0.7853260636329651

# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Accuracy gives a directional view of model performance, but it is not the best metric here because both the Department and Attrition targets are imbalanced. In the training split, Department is heavily skewed toward Research & Development:
Department
Human Resources            47
Research & Development    713
Sales                     342
Similarly, Attrition has far more "No" labels than "Yes" labels:
Attrition
No     924
Yes    178

For both targets, a high accuracy score may mostly reflect correct predictions on the majority class and say little about how well the minority classes are predicted.


2. For the Department output layer I used 'softmax'. Softmax is the standard choice for the output layer in multi-class classification, where each input belongs to exactly one of several possible classes; it fits here because the Department classes are mutually exclusive. For the Attrition output layer I used 'sigmoid' with binary cross-entropy, because it predicts a 'Yes' or 'No' outcome.

3. The model could be improved by addressing the class imbalance, for example by undersampling the majority classes, which would give the model a more balanced input and could improve performance on the minority classes. Experimenting with additional dense layers may also improve accuracy. Hyperparameter tuning is another way to increase accuracy.
