# Employee Attrition Prediction System

## Machine Learning Model

In [1]:
import ipywidgets as widgets
import io

# Create the file upload widget that supplies the training dataset
# (read by process_uploaded_file when datatype == 'training')
training_data_upload_widget = widgets.FileUpload(
    accept='.csv',  # Restrict to .csv files
    multiple=False,  # Allow only one file to be uploaded
    description='Upload CSV',
    layout=widgets.Layout(width='auto', height='auto')
)

# Display the file upload widget
display(training_data_upload_widget)


def process_uploaded_file(datatype):
    """Load the CSV uploaded through one of the notebook's upload widgets.

    Args:
        datatype: Either 'training' or 'prediction'; selects which upload
            widget is read.

    Returns:
        A pandas DataFrame parsed from the uploaded CSV, or None when no
        file has been uploaded to the selected widget yet.

    Raises:
        ValueError: If datatype is neither 'training' nor 'prediction'.
    """
    if datatype == 'training':
        upload_widget = training_data_upload_widget
    elif datatype == 'prediction':
        upload_widget = prediction_data_upload_widget
    else:
        # Fail with a clear message instead of an UnboundLocalError below
        raise ValueError(
            f"Unknown datatype {datatype!r}; expected 'training' or 'prediction'"
        )

    uploaded = upload_widget.value
    if not uploaded:
        return None

    # ipywidgets 8 exposes uploads as a tuple of dicts, whereas ipywidgets 7
    # uses a dict keyed by file name; take the single uploaded file either way
    if isinstance(uploaded, dict):
        file_info = next(iter(uploaded.values()))
    else:
        file_info = uploaded[0]

    # The raw file bytes live under the 'content' key
    content = file_info['content']
    return pd.read_csv(io.BytesIO(content))

FileUpload(value=(), accept='.csv', description='Upload CSV')

In [7]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier


# Probability threshold for classifying an employee as an attrition risk:
# a predicted probability at or above this value is labelled as attrition.
# Raising this number will increase precision but reduce recall
# Lowering this number will reduce precision but increase recall
# Per the author's note, 0.3 results in the best balance between precision
# and recall (F1 score) in this model
CLASSIFICATION_THRESHOLD = 0.3


# Features to drop from the dataset before preprocessing
# (applied to both training and prediction data)
DROP_COLUMNS = ['Over18', 'EmployeeNumber', 'StandardHours', 'EmployeeCount']

class AttritionModel:
    """Train, evaluate and apply a set of attrition classifiers.

    The instance holds the train/test split, the fitted preprocessing
    transformer, the candidate models and one confusion matrix per model.
    Predictions are produced with the 'Ensemble' soft-voting model.
    """

    def __init__(self):
        # Populated by preprocess_training_data()
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Maps model name -> confusion matrix from the latest evaluation
        self.confusion_matrices = {}
        # ColumnTransformer, built during training
        self.preprocessor = None
        self.models = self.initialize_models()

    # Initialize models with predefined hyperparameters chosen based on prior tuning
    # This is based on hyperparameter tuning that was conducted using GridSearchCV
    def initialize_models(self):
        """Return a dict mapping model name to an unfitted estimator."""
        return {
            'Logistic Regression': LogisticRegression(C=0.01, class_weight='balanced', max_iter=5000, penalty='l2', solver='liblinear', random_state=42),
            'Gradient Boosting': GradientBoostingClassifier(learning_rate=0.2, max_depth=3, n_estimators=200, random_state=42),
            'Random Forest': RandomForestClassifier(class_weight=None, max_depth=15, max_features=None, n_estimators=200, random_state=42),
            'Neural Network': MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 100), solver='lbfgs', random_state=42),
            # The ensemble gets its own estimator instances so that fitting it
            # does not touch the standalone models above
            'Ensemble': VotingClassifier(estimators=[
                ('lr', LogisticRegression(C=0.01, class_weight='balanced', max_iter=5000, penalty='l2', solver='liblinear', random_state=42)),
                ('gb', GradientBoostingClassifier(learning_rate=0.2, max_depth=3, n_estimators=200, random_state=42)),
                ('rf', RandomForestClassifier(class_weight=None, max_depth=15, max_features=None, n_estimators=200, random_state=42)),
                ('nn', MLPClassifier(activation='logistic', hidden_layer_sizes=(100, 100), solver='lbfgs', random_state=42))],
                voting='soft')
        }

    def initialize_preprocessor(self, data):
        """Build the (unfitted) preprocessing transformer if none exists yet.

        Args:
            data: DataFrame of features whose dtypes decide which columns
                are scaled (int64/float64) and which are one-hot encoded
                (object). Any other column is passed through unchanged.
        """
        if self.preprocessor is not None:
            return

        # Defining numerical and categorical columns from the data passed in
        numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
        categorical_cols = data.select_dtypes(include=['object']).columns

        # Creating preprocessing pipelines for both numeric and categorical data
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', Pipeline(steps=[('scaler', StandardScaler())]), numerical_cols),
                ('cat', Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_cols)
            ],
            remainder='passthrough'
        )

    def preprocess_training_data(self, data):
        """Split the data 70/30, then fit and apply the preprocessor.

        Args:
            data: Training DataFrame containing an 'Attrition' column with
                'Yes'/'No' values plus the feature columns.

        Stores the transformed splits in X_train/X_test and the encoded
        target (1 for 'Yes', 0 for 'No') in y_train/y_test.
        """
        # Separating features and target variable and dropping non-relevant columns
        X = data.drop([*DROP_COLUMNS, 'Attrition'], axis=1)
        y = data['Attrition'].map({'Yes': 1, 'No': 0})

        # Splitting data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Discard any preprocessor left from an earlier training run so the
        # column lists always match the dataset being trained on now
        self.preprocessor = None
        self.initialize_preprocessor(self.X_train)

        # Fit the preprocessor with training data only
        self.preprocessor.fit(self.X_train)

        # Transform the training and testing data
        self.X_train = self.preprocessor.transform(self.X_train)
        self.X_test = self.preprocessor.transform(self.X_test)

    def preprocess_prediction_data(self, data):
        """Drop the unused columns and apply the fitted preprocessor.

        Args:
            data: DataFrame of employees to score.

        Returns:
            The output of the preprocessor's transform() for the data.
        """
        data = data.drop(DROP_COLUMNS, axis=1)
        return self.preprocessor.transform(data)

    def train_and_evaluate_model(self, name, model, print_metrics=False):
        """Fit one model and score it on the held-out test split.

        Args:
            name: Display name; also the key used in confusion_matrices.
            model: Estimator exposing fit() and predict_proba().
            print_metrics: When True, print accuracy, precision, recall,
                F1 and ROC AUC.
        """
        model.fit(self.X_train, self.y_train)
        y_prob = model.predict_proba(self.X_test)[:, 1]
        predictions = (y_prob >= CLASSIFICATION_THRESHOLD).astype(int)

        accuracy = accuracy_score(self.y_test, predictions)
        precision = precision_score(self.y_test, predictions)
        recall = recall_score(self.y_test, predictions)
        f1 = f1_score(self.y_test, predictions)
        # ROC AUC is computed from the probabilities, not the thresholded labels
        roc_auc = roc_auc_score(self.y_test, y_prob)

        if print_metrics:
            print(f'{name} Model Evaluation:')
            print(f'Accuracy: {accuracy:.4f}')
            print(f'Precision: {precision:.4f}')
            print(f'Recall: {recall:.4f}')
            print(f'F1 Score: {f1:.4f}')
            print(f'ROC AUC: {roc_auc:.4f}')

        # Pin the label order so the matrix is always 2x2 (No, Yes), even when
        # one class is absent from both the test labels and the predictions
        self.confusion_matrices[name] = confusion_matrix(self.y_test, predictions, labels=[0, 1])

    # Evaluate each model on test data
    def train_and_evaluate_all_models(self):
        """Fit and evaluate every model in self.models, printing metrics."""
        print('\nTraining and Evaluating models...\n')
        for name, model in self.models.items():
            self.train_and_evaluate_model(name, model, print_metrics=True)
            print()

    def train_model(self):
        """Load the uploaded training CSV, preprocess it and train all models.

        Raises:
            ValueError: If no training file has been uploaded.
        """
        # NOTE: for local runs without the upload widget the author loaded
        # 'data/WA_Fn-UseC_-HR-Employee-Attrition.csv' with pd.read_csv
        data = process_uploaded_file(datatype='training')
        if data is None:
            raise ValueError('No training data uploaded; upload a CSV file before training.')
        self.preprocess_training_data(data)
        self.train_and_evaluate_all_models()

    def generate_predictions(self):
        """Print the attrition risk ('Yes'/'No') for each uploaded employee.

        Uses the 'Ensemble' model and CLASSIFICATION_THRESHOLD.

        Raises:
            RuntimeError: If called before the model has been trained.
            ValueError: If no prediction file has been uploaded.
        """
        if self.preprocessor is None:
            raise RuntimeError('The model must be trained before generating predictions.')

        print('Generating Predictions...\n')
        prediction_data = process_uploaded_file(datatype='prediction')
        if prediction_data is None:
            raise ValueError('No prediction data uploaded; upload a CSV file before predicting.')

        # Keep the identifiers before DROP_COLUMNS removes them
        employee_numbers = prediction_data['EmployeeNumber']
        preprocessed_data = self.preprocess_prediction_data(prediction_data)
        model = self.models['Ensemble']
        y_prob = model.predict_proba(preprocessed_data)[:, 1]
        risk_labels = ['Yes' if prob >= CLASSIFICATION_THRESHOLD else 'No' for prob in y_prob]
        for emp_num, pred in zip(employee_numbers, risk_labels):
            print(f'Employee Number: {emp_num}, Attrition Risk: {pred}')
        

# Instantiate the AttritionModel object used by the training, visualization
# and prediction code in the following cells
attrition_model = AttritionModel()

In [None]:

from IPython.display import display, clear_output

# Create a button widget for running the model
# (starts disabled; enable_buttons turns it on once a training file is uploaded)
train_model_button = widgets.Button(
    description="Train Model", 
    disabled=True,
    layout=widgets.Layout(width='auto', height='auto')
)


# Output widget that captures everything printed while training
training_output = widgets.Output()


# Callback function for the run model button
def on_train_model_button_clicked(b):
    """Train and evaluate the models, showing the output in training_output.

    Args:
        b: The clicked button instance (unused).
    """
    with training_output:
        clear_output(wait=True)
        attrition_model.train_model()
        print("\n\nModel training and evaluation completed.")
        # Unlock the prediction upload only after training has succeeded.
        # Under IPython the Output context manager displays and suppresses
        # exceptions, so code placed after the `with` block would also run
        # when training failed.
        prediction_data_upload_widget.disabled = False

# Link the button to the callback function
train_model_button.on_click(on_train_model_button_clicked)
# Display the button and, below it, the output widget for the model
display(train_model_button, training_output)

## Visualizations

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns



def generate_visualizations():
    """Plot charts for the uploaded training data and the trained ensemble.

    Draws, in order: a bar chart of each numeric feature's correlation with
    attrition, a box plot of monthly income by job role and attrition
    status, and, if the 'Ensemble' model has been evaluated, its confusion
    matrix.

    Raises:
        ValueError: If no training file has been uploaded.
    """
    data = process_uploaded_file(datatype='training')
    if data is None:
        raise ValueError('No training data uploaded; upload a CSV file before generating visualizations.')

    # Correlation Graph
    # Preparing the data for the correlation graph
    data_filtered = data.drop(columns=['EmployeeNumber', 'StandardHours', 'EmployeeCount'])
    data_filtered['Attrition'] = data_filtered['Attrition'].map({'Yes': 1, 'No': 0})
    # Encode travel frequency ordinally so it takes part in the correlation.
    # NOTE(review): the public IBM HR attrition dataset appears to spell the
    # lowest level 'Non-Travel' (hyphen); both spellings are accepted so those
    # rows are not silently turned into NaN - confirm against the data.
    travel_levels = {'Non-Travel': 1, 'Non_Travel': 1, 'Travel_Rarely': 2, 'Travel_Frequently': 3}
    data_filtered['BusinessTravel'] = data_filtered['BusinessTravel'].map(travel_levels)
    attrition_correlations = data_filtered.corr(numeric_only=True)['Attrition'].sort_values()

    # Excluding the correlation of 'Attrition' with itself
    attrition_correlations = attrition_correlations[attrition_correlations.index != 'Attrition']

    plt.figure(figsize=(12, 12))
    sns.barplot(x=attrition_correlations.index, y=attrition_correlations.values, hue=attrition_correlations.index, palette='vlag', legend=False)
    plt.title('Correlation of Features with Attrition')
    plt.ylabel('Correlation Coefficient with Attrition')
    plt.xlabel('Features')
    plt.xticks(rotation=90)  # Rotate x-axis labels for better visibility
    plt.show()
    print()

    # Income Box and Whisker Graph (uses the unmodified data so the
    # 'Attrition' hue keeps its original values)
    plt.figure(figsize=(12, 12))
    sns.boxplot(x='JobRole', y='MonthlyIncome', hue='Attrition', data=data)
    plt.title('Monthly Income Distribution by Job Role and Attrition Status')
    plt.xlabel('Job Role')
    plt.xticks(rotation=90)
    plt.ylabel('Monthly Income')
    plt.legend(title='Attrition')
    plt.show()
    print()

    # Confusion Matrix
    def plot_confusion_matrix(cm, model_name):
        """Draw one confusion matrix as an annotated heatmap."""
        plt.figure(figsize=(12, 12))
        class_labels = ['No', 'Yes']
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels, cbar=False)
        plt.title(f'Confusion Matrix for {model_name} Model')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        plt.show()

    # Only the Ensemble model's matrix is shown; nothing is drawn when the
    # model has not been trained yet
    ensemble_cm = attrition_model.confusion_matrices.get('Ensemble')
    if ensemble_cm is not None:
        plot_confusion_matrix(ensemble_cm, 'Ensemble')


In [None]:
# Create a button widget for generating visualizations
# (starts disabled; enable_buttons turns it on once a training file is uploaded)
visualize_button = widgets.Button(
    description="Generate Visualizations", 
    disabled=True,
    layout=widgets.Layout(width='auto', height='auto')
)

# Output widget for visualizations
visualization_output = widgets.Output()

# Callback function for the visualization button
def on_visualize_button_clicked(b):
    """Redraw the visualizations inside visualization_output.

    Args:
        b: The clicked button instance (unused).
    """
    with visualization_output:
        # wait=True defers clearing until new output is available
        clear_output(wait=True)
        generate_visualizations()


# Link the button to the callback function
visualize_button.on_click(on_visualize_button_clicked)
# Display the button and the output widget for visualizations
display(visualize_button, visualization_output)

## Predictions

In [None]:
# Create the file upload widget that supplies the prediction dataset
# (starts disabled; on_train_model_button_clicked enables it)
prediction_data_upload_widget = widgets.FileUpload(
    accept='.csv',  # Restrict to .csv files
    multiple=False,  # Allow only one file to be uploaded
    description='Upload CSV',
    disabled=True,
    layout=widgets.Layout(width='auto', height='auto')
)

# Display the file upload widget
display(prediction_data_upload_widget)


# Create a button widget for running the predictions
# (starts disabled; enable_buttons turns it on once a prediction file is uploaded)
generate_predictions_button = widgets.Button(
    description="Generate Predictions", 
    disabled=True,
    layout=widgets.Layout(width='auto', height='auto')
)


# Output widget that captures the printed predictions
predictions_output = widgets.Output()


# Callback function for the run prediction button
def on_generate_predictions_button_clicked(b):
    """Print fresh predictions inside predictions_output.

    Args:
        b: The clicked button instance (unused).
    """
    with predictions_output:
        # wait=True defers clearing until new output is available
        clear_output(wait=True)
        attrition_model.generate_predictions() 


# Link the button to the callback function
generate_predictions_button.on_click(on_generate_predictions_button_clicked)
# Display the button and the output widget for the predictions
display(generate_predictions_button, predictions_output)

In [None]:
# Observer that keeps the action buttons in sync with the upload widgets
def enable_buttons(change):
    """Enable each action button only while its upload widget holds a file.

    Args:
        change: Change notification passed by the widget observer (unused).
    """
    # Training and visualization both depend on the training upload
    training_missing = not training_data_upload_widget.value
    train_model_button.disabled = training_missing
    visualize_button.disabled = training_missing

    # Predictions depend on the prediction upload
    generate_predictions_button.disabled = not prediction_data_upload_widget.value


# Attach the observer function to both file upload widgets so the buttons
# are re-evaluated whenever either widget's 'value' changes
training_data_upload_widget.observe(enable_buttons, names='value')
prediction_data_upload_widget.observe(enable_buttons, names='value')