In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
import warnings
warnings.filterwarnings("ignore")

# Importing data which is in csv format

In [None]:
# Load the HR dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='HR_Analytics.csv')
# Preview the first five rows.
df.head(n=5)

# Start of EDA process

Checking shape of the Dataset

In [None]:
# (rows, columns) of the loaded dataset.
df.shape

Checking data types and null count of the columns

In [None]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

Observation: there are no null or NaN values in the table.

# Lets check how our output variable is distributed

In [None]:
# Class balance of the target. The column is passed by keyword because newer
# seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x='Attrition', data=df)
plt.title('Attrition')
plt.show()

We can see that our output is just Yes or No, so this is a binary classification problem.

# Let's check how all our categorical variables are distributed

In [None]:
# Categorical columns; 'Attrition' (the target) comes first and was already
# plotted in the previous cell.
cold=['Attrition','BusinessTravel','Department','EducationField','Gender','JobRole','MaritalStatus','Over18','OverTime']

# Plot every categorical feature except the target. Slicing the list (instead
# of a hard-coded range(1,9)) keeps the loop correct if the list changes.
for col in cold[1:]:
    # Keyword arguments: newer seaborn treats the first positional argument as `data`.
    sns.countplot(x=col, data=df)
    plt.title(col)
    plt.show()

# We can visually see the contents in our categorical columns and how they are distributed

# Lets visualize all the same using histogram

In [None]:
# One green histogram per column that DataFrame.hist can plot, on a tall 15x20 figure.
df.hist(color='g', figsize=(15, 20))
plt.show()

We can see that some of the columns are skewed, such as YearsAtCompany and YearsInCurrentRole.

# Lets visualize our continuous variables with our output variable

In [None]:
# Numeric columns to compare against the target.
colc=['Age','DailyRate','DistanceFromHome','Education','EmployeeCount','EmployeeNumber','RelationshipSatisfaction','StandardHours','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

# For each numeric column draw a violin, strip and swarm plot split by Attrition.
# Iterating over the list itself (instead of range(0,16)) keeps the loop in sync
# with the list, and the plots are no longer wrapped in print(), which only
# emitted the Axes repr.
for col in colc:
    for plot_fn in (sns.violinplot, sns.stripplot, sns.swarmplot):
        plot_fn(x="Attrition", y=col, data=df)
        plt.title('%s: %s by Attrition' % (plot_fn.__name__, col))
        plt.show()

# Our observations based on the above visualizations
We will concentrate only on Yes part of the output variable now

1. Attrition is highest at around age 30.
2. Employees with a DailyRate between 250 and 500 have a high number of leavers.
3. Employees with a DistanceFromHome between 0 and 5 have the highest number of leavers.
4. Employees with StockOptionLevel 0 have the highest number of leavers.
5. TotalWorkingYears between 8 and 10 also shows the highest number of leavers.
6. The same goes for TrainingTimesLastYear equal to 2.
7. Most leavers have a WorkLifeBalance of 3.
8. Per the YearsAtCompany column, employees who have worked 1-2 years have the highest number of leavers.
9. The same as point 8 holds for YearsInCurrentRole.
10. Attrition seems to be highest at around 1 year for YearsSinceLastPromotion.


Lets convert all our categorical data to numerical

First lets convert our Target variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers: fit the encoder on the Attrition
# column and transform that same column in one step.
le = LabelEncoder()
y1 = le.fit_transform(df['Attrition'])
y1

In [None]:
# Wrap the encoded target array in a one-column frame so it can be
# concatenated with the feature frames.
y = pd.DataFrame({"Attrition": y1})
y.head()

In [None]:
# One-hot encode the categorical feature columns in a single expression.
x1 = pd.get_dummies(
    df[[
        'BusinessTravel',
        'Department',
        'EducationField',
        'Gender',
        'JobRole',
        'MaritalStatus',
        'Over18',
        'OverTime',
    ]]
)
x1.head()

In [None]:
# Numeric feature columns, taken from the raw frame unchanged.
x2 = df[[
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'Education',
    'EmployeeCount',
    'EmployeeNumber',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]]
x2.head()

In [None]:
# Assemble the modelling frame column-wise: numeric features, one-hot
# features, then the encoded target as the last column.
df_final = pd.concat(objs=[x2, x1, y], axis="columns")
df_final.head()

In [None]:
# Pairwise correlation matrix of every column in df_final (features and encoded target).
df_final.corr()

In [None]:
import numpy as np

corr = df_final.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones_like(corr))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(20, 20))
    # vmax caps the colour scale at 0.3.
    ax = sns.heatmap(
        corr,
        ax=ax,
        mask=mask,
        cmap="YlGnBu",
        vmax=.3,
        linewidths=0.5,
        square=True,
    )

In [None]:
df1=df_final.corr()
# Since we only care about the correlation with our output, separate that column.
# Selecting by label (not iloc[:,45:46]) still returns a one-column frame but
# no longer depends on the exact number of dummy columns created earlier.
df2=df1[['Attrition']]
df2

In [None]:
# True marks features whose absolute correlation with Attrition is below 0.1.
df2.abs().lt(0.1)

In [None]:
# We can drop the columns that are weakly correlated, which are showing as True, i.e, DailyRate, DistanceFromHome,EmployeeNumber,RelationshipSatisfaction,TrainingTimesLastYear,WorkLifeBalance,YearsSinceLastPromotion,BusinessTravel_Non-Travel,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,MaritalStatus_Divorced,MaritalStatus_Married

In [None]:
# Show the first rows of the original (un-encoded) frame again for reference.
df.head()

In [None]:
# Keep only the features that were not flagged as weakly correlated with Attrition.
# They must be read from df_final: the one-hot columns (e.g. 'OverTime_Yes')
# exist only there, so selecting them from the raw df raises a KeyError.
# NOTE(review): EmployeeCount, StandardHours and Over18_Y presumably survive the
# |corr| < 0.1 filter only because their correlation is NaN (constant columns) —
# confirm and consider dropping them.
x=df_final[['Age','Education','EmployeeCount','StandardHours','StockOptionLevel','TotalWorkingYears','YearsAtCompany','YearsInCurrentRole','YearsWithCurrManager','BusinessTravel_Travel_Frequently','JobRole_Sales Representative','MaritalStatus_Single','Over18_Y','OverTime_No','OverTime_Yes']]
x.head()

In [None]:
# Use the label-encoded (integer) target stored in df_final rather than the raw
# 'Yes'/'No' strings in df: roc_curve in the model-evaluation loop needs
# numeric binary labels unless pos_label is given.
y=df_final[['Attrition']]
y.head()

# Note: Outliers and skew test was not done as most of the attributes were converted from category

We have both of input and output attributes cleaned and in desired format

End of EDA Process
Lets start Building models to make predictions and find the model that works best on our dataset

Start of Machine Learning Process
Since our target variable is binary, we are going to do classification analysis

Lets import required packages

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc

# Lets split our data randomly and see which model works better

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Let's instantiate all the classification models

In [None]:
# Candidate classifiers. Only the k-NN neighbour count and the decision tree's
# seed are set explicitly; everything else uses library defaults.
GNB = GaussianNB()
DT = DecisionTreeClassifier(random_state=6)
LR = LogisticRegression()
SV = SVC()
KNN = KNeighborsClassifier(n_neighbors=6)

In [None]:
# (display name, estimator) pairs, in the order they will be evaluated.
models = [
    ('KNeighborsClassifier', KNN),
    ('SVC', SV),
    ('LogisticRegression', LR),
    ('DecisionTreeClassifier', DT),
    ('GaussianNB', GNB),
]

# Lets create a loop that will execute all our models

In [None]:
# Per-model results collected for the summary table built in the next cell.
Model = []      # model display names
score = []      # hold-out accuracy (percent)
cvs=[]          # mean 10-fold cross-validated accuracy (percent)
rocscore=[]     # hold-out ROC AUC (percent)

for name,model in models:
    print('*-----------------------------*',name,'*------------------------------*')
    print('\n')
    Model.append(name)

    # Fit on the training split and predict hard labels for the test split.
    model.fit(x_train,y_train)
    print(model)
    pre=model.predict(x_test)
    print('\n')

    AS=accuracy_score(y_test,pre)
    print('Accuracy_score = ',AS)
    score.append(AS*100)
    print('\n')

    # Cross-validation refits clones of the model on the full data set.
    sc = cross_val_score(model, x, y, cv=10, scoring='accuracy').mean()
    print('Cross_Val_Score = ',sc)
    cvs.append(sc*100)
    print('\n')

    # ROC/AUC must be computed from continuous scores, not from hard 0/1
    # predictions (which collapse the curve to a single operating point).
    # Use class-1 probabilities when available, otherwise the decision
    # function (e.g. SVC without probability=True).
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(x_test)[:, 1]
    else:
        y_score = model.decision_function(x_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y_score)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print ('roc_auc_score = ',roc_auc)
    rocscore.append(roc_auc*100)
    print('\n')

    print('classification_report\n',classification_report(y_test,pre))
    print('\n')
    cm=confusion_matrix(y_test,pre)
    print(cm)
    print('\n')

    # Two stacked panels: confusion matrix on top, ROC curve below.
    fig, (ax_cm, ax_roc) = plt.subplots(2, 1, figsize=(10, 9))
    # fmt='d' prints the counts as integers instead of scientific notation.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax_cm)
    ax_cm.set_title(name)
    ax_roc.plot(false_positive_rate, true_positive_rate, label='AUC = %0.2f'% roc_auc)
    ax_roc.plot([0,1],[0,1],'r--')  # chance diagonal
    ax_roc.set_title(name)
    ax_roc.legend(loc='lower right')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_xlabel('False Positive Rate')
    plt.show()
    print('\n\n')

In [None]:
# Summary table: one row per model, one column per metric collected in the loop.
Final_R = pd.DataFrame.from_dict(
    {
        'Classification Model': Model,
        'Accuracy Score': score,
        'Cross_val_score': cvs,
        'Roc_auc_curve': rocscore,
    }
)
Final_R

Logistic Regression seems to be working better with better Cross_val_score and Roc_auc_curve, so lets try to make this better and see if we can increase its accuracy more

# Lets first find best random state for Logistic Regression to split our data into train and test data

In [None]:
# Try seeds 42..99 for the train/test split and remember the first seed that
# gives the highest hold-out accuracy for a default LogisticRegression.
# NOTE(review): choosing the split seed by test accuracy makes the reported
# score optimistic — confirm this is intended.
max_accuracy = 0
for r_state in range(42, 100):
    split = train_test_split(x, y, test_size=0.20, random_state=r_state)
    x_train, x_test, y_train, y_test = split
    lr = LogisticRegression().fit(x_train, y_train)
    y_pred = lr.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy <= max_accuracy:
        continue  # not a strict improvement; keep the earlier seed
    max_accuracy, final_r_state = accuracy, r_state
print("Maximum Accuracy is achived for random state ",final_r_state," with a score of",max_accuracy)

# Lets split our dataset based on above random state

In [None]:
# Re-split with the seed found by the search above. Using final_r_state
# (instead of a literal copied from an earlier run's output) keeps this cell
# consistent with the search on any data or library version.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=final_r_state)

# Lets find best hyperparameters for Logistic Regression using grid search

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over regularisation strength C (1e-3 .. 1e3) and penalty type.
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}

# The default 'lbfgs' solver does not support the 'l1' penalty, so those grid
# points would fail to fit and be scored as NaN (hidden by the global warning
# filter). 'liblinear' supports both penalties, so the whole grid is evaluated.
lr=LogisticRegression(solver='liblinear')
lr_cv=GridSearchCV(lr,grid,cv=10)
lr_cv.fit(x_train,y_train)

print("tuned hpyerparameters :(best parameters) ",lr_cv.best_params_)
print("accuracy :",lr_cv.best_score_)

# Building Final Model

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Take the tuned model from the grid search instead of hard-coding the
# hyperparameters: with the default refit=True, best_estimator_ is already
# refitted on the whole training split using the best parameters found.
lr_f=lr_cv.best_estimator_

pre=lr_f.predict(x_test)
# Class-1 probabilities, used for both the ROC curve and its AUC so the value
# in the legend matches the curve that is drawn.
proba=lr_f.predict_proba(x_test)[:,1]
accuracy=accuracy_score(y_test,pre)

# fmt='d' prints the confusion-matrix counts as integers.
cm=sns.heatmap(confusion_matrix(y_test,pre),annot=True,fmt='d')
print(classification_report(y_test, pre))

roc_auc = roc_auc_score(y_test, proba)
fpr, tpr, thresholds = roc_curve(y_test, proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
print('Accuracy of the model is',accuracy*100)

In [None]:
# We have achieved accuracy of 88.4%

# End of Machine learning Process

# Now lets save our final model

In [None]:
import joblib

# Serialise the fitted final model to disk next to the notebook.
joblib.dump(value=lr_f, filename='HR_Analytics.pkl')