In [64]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
# Read the employee dataset from the working directory.
hr = pd.read_csv('Employee_Data.csv')

# Show the column headers, then preview the first rows as the cell output.
col_names = list(hr.columns)
print("Column names:", col_names, sep="\n")
print("\nSample data:")
hr.head()

In [None]:
# Print pandas' summary of the frame `hr` (per-column dtype and non-null count).
hr.info()

In [67]:
# (rows, columns) of the loaded frame.
hr.shape

(11991, 10)

In [None]:
# Missing-value count per column.
hr.isna().sum()

In [None]:
# Count of rows flagged by `duplicated()`, i.e. rows that repeat an earlier row.
hr.duplicated().sum()

## **Checking Positive and Negative Instances**

In [None]:
# Class balance of the target column: `left == 1` is the positive class
# (employees who have left), `left == 0` the negative class.
left_flags = hr['left']
total_instances = len(hr)

positive_instances = int((left_flags == 1).sum())
negative_instances = int((left_flags == 0).sum())

positive_percentage = positive_instances / total_instances * 100
negative_percentage = negative_instances / total_instances * 100

print("Percentage of positive instances (employees who have left): {:.2f}%".format(positive_percentage))
print("Percentage of negative instances (employees who have not left): {:.2f}%".format(negative_percentage))

In [None]:
# Distinct department labels present in the data.
hr.loc[:, 'departments'].unique()

In [72]:
import numpy as np
# Fold the 'support' and 'IT' departments into 'technical'; every other
# label is kept as it is.
merged_into_technical = hr['departments'].isin(['support', 'IT'])
hr['departments'] = np.where(merged_into_technical, 'technical', hr['departments'])

In [73]:
# One-hot encode the categorical columns and append the indicator columns to
# `hr`. The original columns are left in place here; each new column is named
# "<column>_<value>" via the `prefix` argument.
cat_vars = ['departments', 'salary']
for var in cat_vars:
    # The previous version first assigned a throwaway string to `cat_list`
    # and went through a temporary `hr1` alias; neither had any effect.
    hr = hr.join(pd.get_dummies(hr[var], prefix=var))

In [None]:
# Remove the raw categorical columns now that their one-hot indicators exist.
# Dropping by name (instead of by position 8 and 9) keeps the cell correct if
# the column order changes, and makes an accidental second run fail loudly
# with a KeyError rather than silently dropping two unrelated columns.
# NOTE(review): assumes positions 8 and 9 were 'departments' and 'salary' - confirm against hr.columns.
hr.drop(columns=['departments', 'salary'], inplace=True)
hr.columns.values

In [75]:
# Split the column names into the target (`y`) and the candidate predictors (`X`).
y = ['left']
hr_vars = list(hr.columns)
X = [name for name in hr_vars if name not in y]

## **Feature Selection Process**

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination with a random forest as the ranking model,
# keeping the 10 best-ranked predictors.
# A fixed seed makes the selected feature set reproducible between runs.
rf_model = RandomForestClassifier(random_state=0)
rfe_rf = RFE(estimator=rf_model, n_features_to_select=10)
# `hr[y]` is a one-column DataFrame; flatten it so the estimator receives the
# 1-D target it expects instead of a column vector.
rfe_rf = rfe_rf.fit(hr[X], hr[y].to_numpy().ravel())

print("Selected features using Random Forest:")
print(rfe_rf.support_)
print("Feature ranking:")
print(rfe_rf.ranking_)

In [None]:
# Translate the boolean RFE mask back into column names.
X_df = hr.loc[:, X]
selected_columns = X_df.columns[rfe_rf.support_]
print(selected_columns)

In [78]:
# Predictors used by every model below. From here on `X` is the feature
# frame and `y` the target series (both names previously held lists of
# column names).
cols = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'work_accident',
    'promotion_last_5years',
    'departments_sales',
    'departments_technical',
    'salary_high',
]
X = hr.loc[:, cols]
y = hr.loc[:, 'left']

### **Logistic Regression**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical every time the notebook is run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# The features are not scaled, so the solver may need more than the default
# 100 iterations to converge. Convergence warnings are hidden by the global
# warnings filter, so raise the cap explicitly instead of relying on them.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Share of held-out rows the logistic regression classifies correctly.
logreg_accuracy = accuracy_score(y_test, logreg.predict(X_test))
print('Logistic regression accuracy: {:.3f}'.format(logreg_accuracy))

## **Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training split. The seed fixes the bootstrap
# samples and feature sub-sampling, so the accuracy, ROC curve and feature
# importances reported below are reproducible.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)

In [None]:
# Share of held-out rows the random forest classifies correctly.
rf_accuracy = accuracy_score(y_test, rf.predict(X_test))
print('Random Forest Accuracy: {:.3f}'.format(rf_accuracy))

## **Support Vector Classifier**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# Recreate the 70/30 split (same seed and test size as the logistic
# regression cell), then fit a support vector classifier with default
# settings on the training part.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

svm = SVC()
svm.fit(X_train, y_train)

In [None]:
# Share of held-out rows the support vector classifier gets right.
svm_accuracy = accuracy_score(y_test, svm.predict(X_test))
print('Support Vector Machine Accuracy: {:.3f}'.format(svm_accuracy))

## **Classification Report and Confusion Matrix for Logistic Regression**

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the logistic regression on the test split.
logreg_report = classification_report(y_test, logreg.predict(X_test))
print(logreg_report)


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Confusion matrix for the logistic regression on the test split.
logreg_y_pred = logreg.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are the true class, columns the
# predicted class, which is what the axis labels below say.
# labels=[1, 0] puts "left" (1) before "stayed" (0) so the cells line up with
# the tick labels; the default order [0, 1] would swap them.
logreg_cm = confusion_matrix(y_test, logreg_y_pred, labels=[1, 0])
# The cells are integer counts, so format them as integers.
sns.heatmap(logreg_cm, annot=True, fmt='d',
            xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Logistic Regression')
plt.show()

## **Classification Report and Confusion Matrix for Random Forest**

In [None]:
# Per-class precision / recall / F1 for the random forest on the test split.
rf_report = classification_report(y_test, rf.predict(X_test))
print(rf_report)

In [None]:
# Confusion matrix for the random forest on the test split.
y_pred = rf.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are the true class, columns the
# predicted class, which is what the axis labels below say.
# labels=[1, 0] puts "left" (1) before "stayed" (0) so the cells line up with
# the tick labels; the default order [0, 1] would swap them.
forest_cm = metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])
# The cells are integer counts, so format them as integers.
sns.heatmap(forest_cm, annot=True, fmt='d',
            xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Random Forest')
plt.show()

## **Classification Report and Confusion Matrix for Support Vector Classifier**

In [None]:
# Per-class precision / recall / F1 for the support vector classifier on the test split.
svm_report = classification_report(y_test, svm.predict(X_test))
print(svm_report)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Confusion matrix for the support vector classifier on the test split.
y_pred_svm = svm.predict(X_test)
# scikit-learn expects (y_true, y_pred): rows are the true class, columns the
# predicted class, which is what the axis labels below say.
# labels=[1, 0] puts "left" (1) before "stayed" (0) so the cells line up with
# the tick labels; the default order [0, 1] would swap them.
svm_cm = confusion_matrix(y_test, y_pred_svm, labels=[1, 0])
# The cells are integer counts, so format them as integers.
sns.heatmap(svm_cm, annot=True, fmt='d',
            xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Support Vector Machine Confusion Matrix')
plt.show()

# **ROC Curve**

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# ROC curves and AUC for the three classifiers on the held-out split.
# Both the curve and the AUC are computed from the positive-class
# probability. Scoring hard 0/1 predictions instead collapses the curve to a
# single operating point, so the area in the legend would not match the line
# that is drawn.
logreg_scores = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, logreg_scores)
fpr, tpr, thresholds = roc_curve(y_test, logreg_scores)

rf_scores = rf.predict_proba(X_test)[:, 1]
rf_roc_auc = roc_auc_score(y_test, rf_scores)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_scores)

# SVC only exposes predict_proba when built with probability=True, so refit a
# probability-enabled model. It must be trained on the TRAINING split: fitting
# on X_test / y_test and then scoring the same rows leaks the labels and
# inflates the curve. The seed fixes the internal shuffling used for the
# probability calibration.
svm = SVC(probability=True, random_state=0)
svm.fit(X_train, y_train)

svm_scores = svm.predict_proba(X_test)[:, 1]
svm_roc_auc = roc_auc_score(y_test, svm_scores)
svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_scores)

plt.figure(figsize=(13,8))
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(rf_fpr, rf_tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot(svm_fpr, svm_tpr, label='Support Vector Machine (area = %0.2f)' % svm_roc_auc)
# Diagonal reference line: a classifier that guesses at random.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

# **Feature Importance**

In [None]:
# Names of the model inputs, in the same order as the columns the forest was
# trained on.
feature_labels = np.array([
    'satisfaction_level', 'last_evaluation', 'number_project',
    'average_montly_hours', 'time_spend_company', 'work_accident',
    'promotion_last_5years', 'departments_sales', 'departments_technical',
    'salary_high',
])
importance = rf.feature_importances_

# Print the features from least to most important, as percentages.
feature_indexes_by_importance = np.argsort(importance)
ordered_labels = feature_labels[feature_indexes_by_importance]
ordered_weights = importance[feature_indexes_by_importance]
for label, weight in zip(ordered_labels, ordered_weights):
    print('{}-{:.2f}%'.format(label, weight * 100.0))

# **Feature Importance Visualization**

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the random forest feature importances (in percent),
# smallest at the bottom, with each bar annotated by its value.
categories = feature_labels
percentages = importance * 100.0

# Pair each label with its percentage and order the pairs by percentage.
sorted_data = sorted(zip(categories, percentages), key=lambda pair: pair[1])
sorted_categories = tuple(label for label, _ in sorted_data)
sorted_percentages = tuple(value for _, value in sorted_data)

plt.figure(figsize=(13, 8))
bars = plt.barh(sorted_categories, sorted_percentages, color='orange')
plt.title('Percentage Contribution of Features')
plt.xlabel('Percentage (%)')
plt.ylabel('Features')

# Write the value at the tip of each bar, vertically centred on the bar.
for bar, percentage in zip(bars, sorted_percentages):
    bar_middle = bar.get_y() + bar.get_height() / 2
    plt.text(bar.get_width(), bar_middle, '{:.2f}%'.format(percentage), va='center')
plt.show()