# Human Resources Attrition and Performance Analytics

## 1. Importing necessary libraries

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score

## 2. Importing data

In [None]:
# Read the employee-attrition CSV directly from the project's GitHub repository
# into a DataFrame.
url = 'https://raw.githubusercontent.com/m-bermudez/ISM6251-ML-Project/refs/heads/main/EmployeeAttrition.csv'
df = pd.read_csv(url)
# Trailing expression: the notebook renders a preview of the first rows.
df.head()

## 3. Re-factoring the data

In [None]:
# Source dtype name -> dtype to cast to. Only object columns have an entry,
# so only they are converted (to pandas' categorical dtype).
type_map = {'object': 'category'}

# Collect the columns whose dtype name appears in the map, then recast each
# of them in place.
columns_to_cast = [name for name in df.columns if df[name].dtype.name in type_map]
for column in columns_to_cast:
    df[column] = df[column].astype(type_map[df[column].dtype.name])

## 4. Data Exploration

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Descriptive statistics for the frame; rendered as the cell's output.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### Data Visualization

In [None]:
# Make viridis the default seaborn colour palette for the plots that follow.
sns.set_palette("viridis")

In [None]:
# Employee counts per gender, split by attrition status.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, x='Gender', hue='Attrition')
ax.set(title='Attrition by Gender', xlabel='Gender', ylabel='Count')
plt.show()

In [None]:
# Employee counts per marital status, split by attrition status.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, x='MaritalStatus', hue='Attrition')
ax.set(title='Attrition by Marital Status', xlabel='Marital Status', ylabel='Count')
plt.show()

In [None]:
# Employee counts per business-travel category, split by attrition status.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, x='BusinessTravel', hue='Attrition')
ax.set(title='Attrition by Business Travel', xlabel='Business Travel', ylabel='Count')
plt.show()

In [None]:
# Employee counts per department, split by attrition status.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=df, x='Department', hue='Attrition')
ax.set(title='Attrition by Department', xlabel='Department', ylabel='Count')
plt.show()

In [None]:
# Box plot of age for each attrition group.
plt.figure(figsize=(8, 5))
ax = sns.boxplot(data=df, x='Attrition', y='Age', palette='viridis')
ax.set(title='Age Distribution by Attrition', xlabel='Attrition', ylabel='Age')
plt.show()

In [None]:
# Violin plot of monthly income for each attrition group.
plt.figure(figsize=(8, 5))
ax = sns.violinplot(data=df, x='Attrition', y='MonthlyIncome', palette='viridis')
ax.set(
    title='Monthly Income Distribution by Attrition',
    xlabel='Attrition',
    ylabel='Monthly Income',
)
plt.show()

In [None]:
# Swarm plot of commuting distance for each attrition group.
plt.figure(figsize=(8, 5))
ax = sns.swarmplot(data=df, x='Attrition', y='DistanceFromHome', palette='viridis')
ax.set(
    title='Distance from Home by Attrition',
    xlabel='Attrition',
    ylabel='Distance from Home',
)
plt.show()

In [None]:
# Jittered strip plot of years at the company for each attrition group.
plt.figure(figsize=(8, 5))
ax = sns.stripplot(
    data=df,
    x='Attrition',
    y='YearsAtCompany',
    palette='viridis',
    jitter=True,
)
ax.set(
    title='Years at Company by Attrition',
    xlabel='Attrition',
    ylabel='Years at Company',
)
plt.show()

In [None]:
# Mean job satisfaction per attrition group, drawn without error bars.
# NOTE(review): seaborn 0.12+ deprecates `ci` in favour of `errorbar=None`;
# confirm the installed version before switching.
plt.figure(figsize=(8, 5))
ax = sns.barplot(
    data=df,
    x='Attrition',
    y='JobSatisfaction',
    palette='viridis',
    ci=None,
)
ax.set(
    title='Average Job Satisfaction by Attrition',
    xlabel='Attrition',
    ylabel='Average Job Satisfaction',
)
plt.show()

In [None]:
# Box plot of total working years for each attrition group.
plt.figure(figsize=(8, 5))
ax = sns.boxplot(data=df, x='Attrition', y='TotalWorkingYears', palette='viridis')
ax.set(
    title='Total Working Years by Attrition',
    xlabel='Attrition',
    ylabel='Total Working Years',
)
plt.show()

## 5. Encoding Data

In [None]:
# Label-encode every categorical column in place, storing each fitted encoder
# in `label_encoders` under its column name, and print a label -> code guide.
categorical_cols = df.select_dtypes(include=['category']).columns
label_encoders = {}
divider = "-" * 50

print("Encoding Guide:")
print(divider)

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    # The position of a label in `classes_` is the integer code assigned to it.
    print(f"Column: {col}")
    for code, label in enumerate(le.classes_):
        print(f"  {label} -> {code}")
    print(divider)

In [None]:
# Remove the two columns that are not used in the rest of the analysis.
df = df.drop(columns=['EmployeeNumber', 'Over18'])

In [None]:
# Pairwise correlations between the frame's columns, drawn as an annotated
# heatmap with two-decimal cell labels.
corr_matrix = df.corr()

plt.figure(figsize=(20, 20))
ax = sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="RdBu")
ax.set_title('Correlation Matrix')
plt.show()

### Data Normalization

In [None]:
# Standardise the integer-typed feature columns to zero mean / unit variance.
#
# The target is excluded on purpose: by this point 'Attrition' is expected to
# be integer-coded by the label-encoding step, so selecting every int64 column
# would standardise the class labels as well and turn them into floats.
# `errors='ignore'` keeps the cell working if the target is not among the
# selected columns.
#
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split, so test-set statistics leak into the training features. Consider
# fitting it on the training portion only.
scaler = StandardScaler()
num_cols = df.select_dtypes(include=['int64']).columns.drop('Attrition', errors='ignore')
df[num_cols] = scaler.fit_transform(df[num_cols])

### Train-Test Split

In [None]:
# Separate the target column from the predictors.
y = df['Attrition']
X = df.drop(columns='Attrition')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature Importance - Regression Model

In [None]:
# Fit a random-forest regressor on the training split and rank the features
# by the importances it reports (largest first).
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
importances = rf.feature_importances_

importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

plt.figure(figsize=(15, 10))
ax = sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis')
ax.set_title('Feature Importance')
plt.show()

In [None]:
# Keep only the features whose importance is strictly above the cut-off and
# build reduced copies of the train/test feature frames from them.
threshold = 0.01

keep_mask = importance_df['Importance'] > threshold
important_features = importance_df.loc[keep_mask, 'Feature']
print(f"Selected features: {important_features.tolist()}")

X_train_reduced, X_test_reduced = X_train[important_features], X_test[important_features]

print(f"Original feature count: {X_train.shape[1]}")
print(f"Reduced feature count: {X_train_reduced.shape[1]}")

In [None]:
# Bar chart restricted to the features that passed the importance cut-off.
selected_importances = importance_df[importance_df['Importance'] > threshold]

plt.figure(figsize=(15, 10))
ax = sns.barplot(data=selected_importances, x='Importance', y='Feature', palette='viridis')
ax.set(
    title='Remaining Feature Importances After Threshold Selection',
    xlabel='Importance',
    ylabel='Feature',
)
plt.tight_layout()
plt.show()

In [None]:
# Classifier cell: train the four candidate models and report accuracy, the
# classification report and the confusion matrix for each one.
#
# 'Attrition' reaches this cell as a two-valued numeric column (it may have
# been rescaled by the normalisation step), so it is binarised first. The
# cut-off is the midpoint between the smallest and largest training value,
# which separates the two classes whatever their proportions; a median
# cut-off would label every row 0 as soon as the positive class became the
# majority. The cut-off has its own name so that the feature-importance
# `threshold` defined by the feature-selection cell is not overwritten.
label_cutoff = (y_train.min() + y_train.max()) / 2
y_train_class = np.where(y_train > label_cutoff, 1, 0)
y_test_class = np.where(y_test > label_cutoff, 1, 0)

# The tree-based estimators are randomised; seed them (same seed as the
# train/test split) so the reported metrics are reproducible between runs.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

# Model name -> test-set accuracy.
results = {}

# NOTE(review): the models are fitted on the full feature set (X_train), not
# on X_train_reduced produced by the feature-selection cell; confirm that
# this is intended.
for name, model in models.items():
    model.fit(X_train, y_train_class)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test_class, y_pred)
    results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"{name} Classification Report:")
    print(classification_report(y_test_class, y_pred))
    print(f"{name} Confusion Matrix:")
    print(confusion_matrix(y_test_class, y_pred))
    print()

### Results Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One row per model with the accuracy recorded in `results`.
results_df = pd.DataFrame(
    {'Model': list(results), 'Accuracy': list(results.values())}
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=results_df, x='Model', y='Accuracy', palette='viridis')
ax.set(
    title='Model Accuracy Comparison',
    xlabel='Model',
    ylabel='Accuracy',
    ylim=(0, 1),
)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score

# Precision of each fitted model on the held-out test set.
precision_results = {
    name: precision_score(y_test_class, model.predict(X_test))
    for name, model in models.items()
}

precision_df = pd.DataFrame(
    {'Model': list(precision_results), 'Precision': list(precision_results.values())}
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=precision_df, x='Model', y='Precision', palette='viridis')
ax.set(
    title='Model Precision Comparison',
    xlabel='Model',
    ylabel='Precision',
    ylim=(0, 1),
)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import recall_score

# Recall of each fitted model on the held-out test set.
recall_results = {
    name: recall_score(y_test_class, model.predict(X_test))
    for name, model in models.items()
}

recall_df = pd.DataFrame(
    {'Model': list(recall_results), 'Recall': list(recall_results.values())}
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=recall_df, x='Model', y='Recall', palette='viridis')
ax.set(
    title='Model Recall Comparison',
    xlabel='Model',
    ylabel='Recall',
    ylim=(0, 1),
)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# F1-score of each fitted model on the held-out test set.
f1_results = {
    name: f1_score(y_test_class, model.predict(X_test))
    for name, model in models.items()
}

f1_df = pd.DataFrame(
    {'Model': list(f1_results), 'F1-Score': list(f1_results.values())}
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(data=f1_df, x='Model', y='F1-Score', palette='viridis')
ax.set(
    title='Model F1-Score Comparison',
    xlabel='Model',
    ylabel='F1-Score',
    ylim=(0, 1),
)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Recommended Model: Gradient Boosting
### Reasoning:

Because our business problem revolves around identifying employees who are likely to leave (Class 1), Gradient Boosting is the best fit. It offers the following:

**Precision (71%):**  Among the highest compared to other models, meaning it reduces false positives.

**Recall (31%):** While still low, it is higher than Decision Tree (26%), KNN (3%), and Random Forest (8%).

**F1-Score (43%):** Indicates a better balance between precision and recall for Class 1 than the other models.

**Accuracy (89%):**
Gradient Boosting achieves the highest overall accuracy at 89%, outperforming all other models in predicting employee attrition.

**Confusion Matrix:**

Gradient Boosting correctly predicts the majority of Class 0 (non-attrition) cases (250/255) while identifying more Class 1 cases (12/39) compared to other models.

### Comparison with Other Models:

**Decision Tree:**
Performs decently overall but has poor recall for Class 1 (26%) and a lower F1-score (25%), indicating it's less reliable for attrition predictions.

**KNN:** High overall accuracy (86%) but extremely poor recall for Class 1 (3%), making it almost useless for identifying employees likely to leave.

**Random Forest:**
Its accuracy (87%) is close to Gradient Boosting's, but it fails to identify Class 1 effectively, with low recall (8%) and a low F1-score (14%).

**Conclusion:**
Gradient Boosting is recommended due to its:

- Highest accuracy.
- Balanced precision and recall for attrition cases.
- Stronger overall performance in predicting employee retention outcomes.

### Business Impact:

Choosing the model with the best balance between identifying true attrition cases and minimizing false positives and false negatives is crucial for designing effective retention strategies. Gradient Boosting strikes this balance better than the other models: it has the highest accuracy, a more balanced precision and recall, and stronger overall performance in predicting employee retention outcomes.