In [10]:
import numpy as np
import pandas as pd
from sklearn import tree, preprocessing
from sklearn.ensemble import RandomForestClassifier

In [3]:
data = pd.read_csv('Dataset/general_data.csv')
data.head(4)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5


In [4]:
data.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [5]:
#as there are 2 features having null values will replacing them with the corresponding mean of the features

data['NumCompaniesWorked'].mean()

2.6948303347756775

In [6]:
data['TotalWorkingYears'].mean()

11.279936378095888

In [7]:
num_companies_worked = np.where(data['NumCompaniesWorked'].isnull(),3,data['NumCompaniesWorked'])
num_of_working_years = np.where(data['TotalWorkingYears'].isnull(),11,data['TotalWorkingYears'])

In [8]:
data['NumCompaniesWorked'] = num_companies_worked
data['TotalWorkingYears'] = num_of_working_years

In [21]:
label_encoder = preprocessing.LabelEncoder()
encoded_data = data.apply(label_encoder.fit_transform)
encoded_data.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,33,0,2,2,5,1,1,0,0,0,...,1,0,0,0,0,1,6,1,0,0
1,13,1,1,1,9,0,1,0,1,0,...,0,0,12,0,1,6,3,5,1,4


In [22]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2, oob_score=True)

In [23]:
features =["Age","BusinessTravel", "Department", "DistanceFromHome", "Education", "EducationField", "Gender", "JobLevel", "JobRole",  "MaritalStatus", "MonthlyIncome", "NumCompaniesWorked",  "PercentSalaryHike", "StockOptionLevel",  "TotalWorkingYears", "TrainingTimesLastYear", "YearsAtCompany","YearsSinceLastPromotion", "YearsWithCurrManager"]

In [24]:
rf_model.fit(X=encoded_data[features],y=encoded_data["Attrition"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
print("OOB Accuracy", rf_model.oob_score_)

OOB Accuracy 1.0


In [27]:
for feature,imp in zip(features,rf_model.feature_importances_):
    print(feature,imp)

Age 0.09660325028055937
BusinessTravel 0.028309047548824923
Department 0.025937759900842308
DistanceFromHome 0.06933245102778172
Education 0.040519508992163764
EducationField 0.0414323476785801
Gender 0.0183329130831887
JobLevel 0.03734244804064615
JobRole 0.05646470910480417
MaritalStatus 0.0396249510022188
MonthlyIncome 0.09519070657402091
NumCompaniesWorked 0.056653951786632684
PercentSalaryHike 0.06569359096690787
StockOptionLevel 0.03451187013333081
TotalWorkingYears 0.08557949409137373
TrainingTimesLastYear 0.04462404878302933
YearsAtCompany 0.06706280528781117
YearsSinceLastPromotion 0.043333721875103835
YearsWithCurrManager 0.05345042384217958


In [28]:
tree_model = tree.DecisionTreeClassifier(max_depth = 8)

In [29]:

predictors =pd.DataFrame([encoded_data['Age'],encoded_data['MonthlyIncome'],encoded_data['TotalWorkingYears'],encoded_data['DistanceFromHome'],encoded_data['YearsAtCompany'],encoded_data['PercentSalaryHike'],encoded_data['NumCompaniesWorked'],encoded_data['JobRole'],encoded_data['YearsWithCurrManager']]).T

In [30]:
tree_model.fit(X=predictors,y=encoded_data['Attrition'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [31]:
with open("Attrition.dot","w") as f:
    f = tree.export_graphviz(tree_model,feature_names=["Age","MonthlyIncome","TotalWorkingYears","DistanceFromHome","YearsAtCompany","PercentSalaryHike","NumCompaniesWorked","JobRole","YearsWithCurrManager"],out_file=f)

In [34]:
print("Classification Accuracy :", tree_model.score(X=predictors,y=encoded_data["Attrition"]))

Classification Accuracy : 0.9018140589569161
