 # Decision Tree 
 ## Attrition

---

In [2]:
import numpy as np
import pandas as pd 
from sklearn import tree
from sklearn import preprocessing

In [3]:
dataset = pd.read_csv('general_data.csv')
dataset.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


### Checking Null values

In [6]:
dataset.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [4]:
dataset.isna().sum().sum()

28

In [8]:
dataset = dataset.fillna(dataset.mean())

In [9]:
dataset.isna().sum().sum()

0

### Droping the unnecessary records

In [11]:
dataset.drop(['BusinessTravel','Department','EducationField','JobRole','MaritalStatus','Over18'],axis=1,inplace=True)
dataset.head()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeCount,EmployeeID,Gender,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,6,2,1,1,Female,1,131160,1.0,11,8,0,1.0,6,1,0,0
1,31,Yes,10,1,1,2,Female,1,41890,0.0,23,8,1,6.0,3,5,1,4
2,32,No,17,4,1,3,Male,4,193280,1.0,15,8,3,5.0,2,5,0,3
3,38,No,2,5,1,4,Male,3,83210,3.0,11,8,3,13.0,5,8,7,5
4,32,No,10,1,1,5,Male,1,23420,4.0,12,8,2,9.0,2,6,0,4


### Convert Gender and Attrition into 0 or 1

In [22]:
label_encoder = preprocessing.LabelEncoder()
dataset['Gender'] = label_encoder.fit_transform(dataset['Gender'])
dataset['Attrition'] = label_encoder.fit_transform(dataset['Attrition'])

---

# Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
dataset.columns

Index(['Age', 'Attrition', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeID', 'Gender', 'JobLevel', 'MonthlyIncome',
       'NumCompaniesWorked', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

### -  define random forest

In [27]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)
features = ['Age','DistanceFromHome','Education','EmployeeCount','EmployeeID','Gender','JobLevel','MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','StandardHours','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear','YearsAtCompany','YearsSinceLastPromotion','YearsWithCurrManager']

In [28]:
rf_model.fit( X =dataset[features], y=dataset['Attrition'])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

### Accuracy

In [30]:
print('Accuracy of 17 IDV and 1 DV ')
print('OOB Accuracy :',rf_model.oob_score_)

Accuracy of 17 IDV and 1 DV 
OOB Accuracy : 0.999092970521542


### - Imp varaible

In [32]:
for features,imp in zip (features,rf_model.feature_importances_) :
    print(features,imp)

Age 0.11717072371603994
DistanceFromHome 0.08607253458144826
Education 0.04779348207080764
EmployeeCount 0.0
EmployeeID 0.05143330095677578
Gender 0.01979933765524327
JobLevel 0.04465673914441285
MonthlyIncome 0.11929866812139227
NumCompaniesWorked 0.0635625965861748
PercentSalaryHike 0.07886577241993077
StandardHours 0.0
StockOptionLevel 0.0397723734202081
TotalWorkingYears 0.0953677556135933
TrainingTimesLastYear 0.05297984188551231
YearsAtCompany 0.07506760235203377
YearsSinceLastPromotion 0.04749447581874078
YearsWithCurrManager 0.06066479565768615


---

# Decision Tree 

In [50]:
predictors = pd.DataFrame([dataset['StandardHours'],dataset['EmployeeCount'],dataset['MonthlyIncome'],dataset['Age']]).T
tree_model = tree.DecisionTreeClassifier(max_depth=8)
tree_model.fit(X=predictors, y=dataset['Attrition'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

### - Graphviz

In [36]:
with open('Dtree1.dot','w') as f :
    f = tree.export_graphviz(tree_model, feature_names=['EmployeeCount','MonthlyIncome','StandardHours','Age'], out_file=f);

### - Accuracy

In [51]:
Accuracy = tree_model.score(X=predictors, y=dataset['Attrition'])
Accuracy

0.8775510204081632