# Decision tree

In [2]:
from sklearn import tree

In [None]:
from sklearn.tree import export_graphviz

In [3]:
import graphviz

## Visualize the Decision Boundary

In [None]:
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
def gini(p):
    return 2*(p)*(1-p)
def entropy(p):
    return - p*np.log2(p) - (1-p)*np.log2((1-p))
def error(p):
    return 1 - np.max([p,1-p])

## Modelling End-to-End with Decision Tree

In [None]:
from sklearn.datasets import make_moons

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import accuracy_score

## Project HR

Download sample HR data from https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ WA_Fn-UseC_-HR-Employee-Attrition.xlsx and save as csv.


In [23]:
import pandas as pd

In [24]:
df = pd.read_csv("../WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [25]:
df.shape

(1470, 35)

In [26]:
df.pop('EmployeeCount')
df.pop('EmployeeNumber')
df.pop('Over18')
df.pop('StandardHours')

0       80
1       80
2       80
3       80
4       80
5       80
6       80
7       80
8       80
9       80
10      80
11      80
12      80
13      80
14      80
15      80
16      80
17      80
18      80
19      80
20      80
21      80
22      80
23      80
24      80
25      80
26      80
27      80
28      80
29      80
        ..
1440    80
1441    80
1442    80
1443    80
1444    80
1445    80
1446    80
1447    80
1448    80
1449    80
1450    80
1451    80
1452    80
1453    80
1454    80
1455    80
1456    80
1457    80
1458    80
1459    80
1460    80
1461    80
1462    80
1463    80
1464    80
1465    80
1466    80
1467    80
1468    80
1469    80
Name: StandardHours, Length: 1470, dtype: int64

In [27]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [28]:
y=df['Attrition']
X = df
X.pop('Attrition')

0       Yes
1        No
2       Yes
3        No
4        No
5        No
6        No
7        No
8        No
9        No
10       No
11       No
12       No
13       No
14      Yes
15       No
16       No
17       No
18       No
19       No
20       No
21      Yes
22       No
23       No
24      Yes
25       No
26      Yes
27       No
28       No
29       No
       ... 
1440     No
1441     No
1442    Yes
1443     No
1444    Yes
1445     No
1446     No
1447     No
1448     No
1449     No
1450     No
1451     No
1452    Yes
1453     No
1454     No
1455     No
1456     No
1457     No
1458     No
1459     No
1460     No
1461    Yes
1462     No
1463     No
1464     No
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object

In [29]:
y.unique()

array(['Yes', 'No'], dtype=object)

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelBinarizer()


In [12]:
y=le.fit_transform(y)

In [13]:
y

array([[1],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

In [14]:
y.shape

(1470, 1)

In [None]:
df.info()

In [None]:
df.select_dtypes(['object'])

In [None]:
ind_BusinessTravel = pd.get_dummies(df['BusinessTravel'],prefix='BusinessTravel')
ind_Department = pd.get_dummies(df['Department'],prefix='Department')
ind_EducationField = pd.get_dummies(df['EducationField'],prefix='EducationField')
ind_Gender = pd.get_dummies(df['Gender'],prefix='Gender')
ind_JobRole = pd.get_dummies(df['JobRole'],prefix='JobRole')
ind_MaritalStatus = pd.get_dummies(df['MaritalStatus'],prefix='MaritalStatus')
ind_OverTime = pd.get_dummies(df['OverTime'],prefix='OverTime')

In [None]:
df1 =pd.concat([ind_BusinessTravel, ind_Department, ind_EducationField, ind_Gender, ind_JobRole,
                ind_MaritalStatus, ind_OverTime, df.select_dtypes(['int64'])]
               , axis=1)

In [None]:
df1

In [None]:
df1.shape

### Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df1,y)

In [None]:
dtc = tree.DecisionTreeClassifier(random_state=42)

In [None]:
dtc = dtc.fit(X_train, y_train)

In [None]:
dtc

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
accuracy_score(y_train,dtc.predict(X_train))

In [None]:
print(classification_report(y_train,dtc.predict(X_train)))

In [None]:
confusion_matrix(y_train,dtc.predict(X_train))

In [None]:
print(classification_report(y_test,dtc.predict(X_test)))

In [None]:
confusion_matrix(y_test,dtc.predict(X_test))

In [None]:
def print_training_score(clf, X, y):
    print("Training Result:\n")
    print("Accuracy: {0:.4f}\n".format(accuracy_score(y, clf.predict(X))))
    print("Classification Report: \n {} \n".format(classification_report(y, clf.predict(X))))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y, clf.predict(X))))

    res = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

In [None]:
def print_test_score(clf, X, y):
    print("Test Result:\n")

    print("Accuracy: {0:.4f}\n".format(accuracy_score(y, clf.predict(X))))
    print("Classification Report: \n {} \n".format(classification_report(y, clf.predict(X))))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y, clf.predict(X))))

In [None]:
print_training_score(dtc, X_train, y_train)

In [None]:
print_test_score(dtc, X_test, y_test)

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
bag_clf = BaggingClassifier(base_estimator=dtc, n_estimators=5000, bootstrap=True, n_jobs=-1, random_state=42)

In [None]:
bag_clf.fit(X_train, y_train)

In [None]:
print_training_score(bag_clf, X_train, y_train)
print_test_score(bag_clf, X_test, y_test)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier()

In [None]:
rf_clf.fit(X_train, y_train)

In [None]:
print_training_score(rf_clf, X_train, y_train)
print_test_score(rf_clf, X_test, y_test)

In [None]:
import seaborn as sns

In [None]:
pd.Series(rf_clf.feature_importances_,
         index=X_train.columns).sort_values(ascending=False).plot(kind='bar', figsize=(12,6));

# Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
ada_clf = AdaBoostClassifier()

In [None]:
ada_clf.fit(X_train, y_train)

In [None]:
print_training_score(ada_clf, X_train, y_train)
print_test_score(ada_clf, X_test, y_test)

# Ada + RandomForest

In [None]:
arf_clf = AdaBoostClassifier(RandomForestClassifier())

In [None]:
arf_clf.fit(X_train, y_train)
print_training_score(arf_clf, X_train, y_train)
print_test_score(arf_clf, X_test, y_test)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc_clf = GradientBoostingClassifier()
gbc_clf.fit(X_train, y_train)
print_training_score(gbc_clf, X_train, y_train)
print_test_score(gbc_clf, X_test, y_test)

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
xg_clf = xgb.XGBClassifier()
xg_clf.fit(X_train, y_train)
print_training_score(xg_clf, X_train, y_train)
print_test_score(xg_clf, X_test, y_test)

In [30]:
from sklearn import preprocessing

In [31]:
le = preprocessing.LabelBinarizer()

In [32]:
tmp=le.fit_transform(y)

In [33]:
type(tmp)

numpy.ndarray

In [35]:
tmp = pd.Series(list(tmp))

In [36]:
tmp.value_counts()

[0]    1233
[1]     237
dtype: int64

In [37]:
tmp.value_counts() / tmp.count()

[0]    0.838776
[1]    0.161224
dtype: float64

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 30 columns):
Age                         1470 non-null int64
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome               1470 non-null int64
MonthlyRate                 1470 non-null int64
NumCompaniesWorked          1470 non-null int64
OverTime               