# Decision tree

In [None]:
from sklearn import tree

In [None]:
from sklearn.tree import export_graphviz

In [None]:
import graphviz

## Visualize the Decision Boundary

In [None]:
# Numerical and plotting libraries used by the cells below.
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
# Use seaborn's 'whitegrid' style for subsequent plots.
sns.set_style('whitegrid')
# IPython magic selecting the inline matplotlib backend (only valid inside IPython/Jupyter).
%matplotlib inline

In [None]:
def gini(p):
    """Return the two-class Gini impurity ``2 * p * (1 - p)`` for a class proportion ``p``."""
    complement = 1 - p
    return 2 * p * complement
def entropy(p):
    """Return the two-class entropy (in bits) of a class proportion ``p``.

    Evaluates ``-p*log2(p) - (1-p)*log2(1-p)`` element-wise, with the
    convention ``0 * log2(0) == 0`` so that the pure cases ``p == 0`` and
    ``p == 1`` give ``0`` rather than ``nan``.

    Parameters
    ----------
    p : float or array-like
        Proportion(s) of one class. Values outside ``[0, 1]`` still yield
        ``nan``.

    Returns
    -------
    numpy float for scalar input, ``ndarray`` of the same shape otherwise.
    """
    p = np.asarray(p, dtype=float)
    # At the boundaries one term evaluates to 0 * -inf = nan; silence the
    # resulting floating-point warnings and patch those entries below.
    with np.errstate(divide='ignore', invalid='ignore'):
        raw = - p*np.log2(p) - (1-p)*np.log2((1-p))
    result = np.where((p == 0) | (p == 1), 0.0, raw)
    # np.where gives a 0-d array for scalar input; [()] unwraps it to a
    # scalar and returns n-d arrays unchanged.
    return result[()]
def error(p):
    """Return the two-class misclassification error ``1 - max(p, 1 - p)``.

    Works element-wise, so an array of proportions yields an array of
    errors of the same shape (consistent with ``gini`` and ``entropy``).

    Parameters
    ----------
    p : float or ndarray
        Proportion(s) of one class.
    """
    # np.maximum compares p and 1 - p position by position; np.max over the
    # stacked pair would reduce an array input to one global maximum.
    return 1 - np.maximum(p, 1 - p)

## Modelling End-to-End with Decision Tree

In [None]:
from sklearn.datasets import make_moons

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.metrics import accuracy_score

## Project HR

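Download the sample HR dataset `WA_Fn-UseC_-HR-Employee-Attrition.xlsx` from https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ and save it as a CSV file named `WA_Fn-UseC_-HR-Employee-Attrition.csv` in the parent directory, which is where the cell below reads it from.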


In [None]:
import pandas as pd

In [None]:
# Load the attrition data from the CSV in the parent directory (relative path).
df = pd.read_csv("../WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Remove columns that are not used as features; DataFrame.pop deletes each
# one from df in place (the returned column is discarded).
for unused_column in ('EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'):
    df.pop(unused_column)

In [None]:
# Column labels remaining after the pops above.
df.columns

In [None]:
# Separate the target from the features. X is bound to the same object as df
# (not a copy), so removing 'Attrition' through X also removes it from df.
X = df
y = X.pop('Attrition')

In [None]:
# Distinct label values present in the target.
y.unique()

In [None]:
from sklearn import preprocessing
# Encoder used below to turn the target labels into a numeric indicator array.
le = preprocessing.LabelBinarizer()


In [None]:
# LabelBinarizer returns an (n_samples, 1) column vector for a two-class
# target; flatten it to 1-D, which is the shape scikit-learn estimators
# expect for y (a column vector triggers DataConversionWarning in several).
y = le.fit_transform(y).ravel()

In [None]:
# Encoded target values.
y

In [None]:
# Dimensions of the encoded target.
y.shape

In [None]:
# Column dtypes and non-null counts of the feature frame.
df.info()

In [None]:
# Columns stored with object dtype (the ones expanded into indicators below).
df.select_dtypes(['object'])

In [None]:
# Expand each listed column into indicator (one-hot) columns; `prefix`
# makes the generated column names start with the source column's name.
ind_BusinessTravel = pd.get_dummies(df['BusinessTravel'],prefix='BusinessTravel')
ind_Department = pd.get_dummies(df['Department'],prefix='Department')
ind_EducationField = pd.get_dummies(df['EducationField'],prefix='EducationField')
ind_Gender = pd.get_dummies(df['Gender'],prefix='Gender')
ind_JobRole = pd.get_dummies(df['JobRole'],prefix='JobRole')
ind_MaritalStatus = pd.get_dummies(df['MaritalStatus'],prefix='MaritalStatus')
ind_OverTime = pd.get_dummies(df['OverTime'],prefix='OverTime')

In [None]:
# Assemble the model matrix: every indicator frame side by side (axis=1),
# followed by the int64 columns of df.
df1 = pd.concat(
    [
        ind_BusinessTravel,
        ind_Department,
        ind_EducationField,
        ind_Gender,
        ind_JobRole,
        ind_MaritalStatus,
        ind_OverTime,
        df.select_dtypes(['int64']),
    ],
    axis=1,
)

In [None]:
# Display the assembled feature frame.
df1

In [None]:
# (rows, columns) of the feature frame.
df1.shape

### Decision Tree

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split

In [None]:
# Hold out a test set (default split proportions). The seed is fixed so the
# split, and every score computed from it, is reproducible between runs,
# matching the random_state=42 used for the models.
X_train, X_test, y_train, y_test = train_test_split(df1, y, random_state=42)

In [None]:
# Decision tree with otherwise-default settings and a fixed seed.
dtc = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Fit on the training split (fit returns the estimator itself).
dtc = dtc.fit(X_train, y_train)

In [None]:
# Display the fitted estimator.
dtc

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Accuracy on the data the tree was fitted on.
accuracy_score(y_train,dtc.predict(X_train))

In [None]:
# Per-class precision/recall/F1 on the training split.
print(classification_report(y_train,dtc.predict(X_train)))

In [None]:
# Confusion matrix on the training split.
confusion_matrix(y_train,dtc.predict(X_train))

In [None]:
# Per-class precision/recall/F1 on the held-out test split.
print(classification_report(y_test,dtc.predict(X_test)))

In [None]:
# Confusion matrix on the held-out test split.
confusion_matrix(y_test,dtc.predict(X_test))

In [None]:
def print_training_score(clf, X, y, cv=10):
    """Print training metrics and cross-validated accuracy for a classifier.

    Prints the accuracy, classification report and confusion matrix of
    ``clf.predict(X)`` against ``y``, then the mean and standard deviation of
    the accuracy returned by ``cross_val_score`` on ``(X, y)``.

    Parameters
    ----------
    clf : estimator
        Already-fitted classifier exposing ``predict``; it is also handed to
        ``cross_val_score``.
    X : array-like
        Feature matrix to score on.
    y : array-like
        True labels for ``X``.
    cv : int or cross-validation splitter, default 10
        Forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    print("Training Result:\n")
    # Predict once and reuse the result for all three metrics; predicting
    # repeatedly is wasted work, noticeably so for large ensembles.
    y_pred = clf.predict(X)
    print("Accuracy: {0:.4f}\n".format(accuracy_score(y, y_pred)))
    print("Classification Report: \n {} \n".format(classification_report(y, y_pred)))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y, y_pred)))

    res = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))

In [None]:
def print_test_score(clf, X, y):
    """Print accuracy, classification report and confusion matrix on test data.

    Parameters
    ----------
    clf : estimator
        Already-fitted classifier exposing ``predict``.
    X : array-like
        Held-out feature matrix.
    y : array-like
        True labels for ``X``.

    Returns
    -------
    None
        Results are written to stdout only.
    """
    print("Test Result:\n")

    # Predict once and reuse the result for all three metrics.
    y_pred = clf.predict(X)
    print("Accuracy: {0:.4f}\n".format(accuracy_score(y, y_pred)))
    print("Classification Report: \n {} \n".format(classification_report(y, y_pred)))
    print("Confusion Matrix: \n {} \n".format(confusion_matrix(y, y_pred)))

In [None]:
# Training-split metrics plus cross-validated accuracy for the tree.
print_training_score(dtc, X_train, y_train)

In [None]:
# Held-out metrics for the same tree.
print_test_score(dtc, X_test, y_test)

## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble of 5000 bootstrap-sampled copies of the decision tree,
# trained on all available cores (n_jobs=-1) with a fixed seed.
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on both.
bag_clf = BaggingClassifier(dtc, n_estimators=5000, bootstrap=True, n_jobs=-1, random_state=42)

In [None]:
# Fit the bagging ensemble on the training split.
bag_clf.fit(X_train, y_train)

In [None]:
# Report training (including cross-validation) and held-out metrics.
print_training_score(bag_clf, X_train, y_train)
print_test_score(bag_clf, X_test, y_test)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with library-default settings; no random_state is given,
# so results can differ from run to run.
rf_clf = RandomForestClassifier()

In [None]:
# Fit on the training split.
rf_clf.fit(X_train, y_train)

In [None]:
# Report training (including cross-validation) and held-out metrics.
print_training_score(rf_clf, X_train, y_train)
print_test_score(rf_clf, X_test, y_test)

In [None]:
import seaborn as sns

In [None]:
# Label the forest's feature importances with the training column names and
# draw them as a bar chart, largest first. The trailing semicolon suppresses
# the notebook's text echo of the returned Axes.
importances = pd.Series(rf_clf.feature_importances_, index=X_train.columns)
importances.sort_values(ascending=False).plot(kind='bar', figsize=(12,6));

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with library-default settings (no base estimator or seed given).
ada_clf = AdaBoostClassifier()

In [None]:
# Fit on the training split.
ada_clf.fit(X_train, y_train)

In [None]:
# Report training (including cross-validation) and held-out metrics.
print_training_score(ada_clf, X_train, y_train)
print_test_score(ada_clf, X_test, y_test)

# AdaBoost + Random Forest

In [None]:
# AdaBoost that boosts a default-configured random forest, passed
# positionally as the base estimator.
arf_clf = AdaBoostClassifier(RandomForestClassifier())

In [None]:
# Fit, then report training (including cross-validation) and held-out metrics.
arf_clf.fit(X_train, y_train)
print_training_score(arf_clf, X_train, y_train)
print_test_score(arf_clf, X_test, y_test)

## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with library-default settings: fit, then report
# training (including cross-validation) and held-out metrics.
gbc_clf = GradientBoostingClassifier()
gbc_clf.fit(X_train, y_train)
print_training_score(gbc_clf, X_train, y_train)
print_test_score(gbc_clf, X_test, y_test)

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost classifier with library-default settings: fit, then report
# training (including cross-validation) and held-out metrics.
xg_clf = xgb.XGBClassifier()
xg_clf.fit(X_train, y_train)
print_training_score(xg_clf, X_train, y_train)
print_test_score(xg_clf, X_test, y_test)

In [None]:
from sklearn import preprocessing

In [None]:
# Fresh binarizer (rebinds the name `le` used earlier in the notebook).
le = preprocessing.LabelBinarizer()

In [None]:
# Encode the current contents of y into a separate array for inspection.
tmp=le.fit_transform(y)

In [None]:
# Check which container type fit_transform returned.
type(tmp)

In [None]:
# fit_transform returns an (n_samples, 1) array for a two-class target.
# Flatten it so the Series holds scalar labels: list(tmp) would store one
# 1-element ndarray per row, and those are unhashable, so they cannot be
# tallied by value_counts().
tmp = pd.Series(tmp.ravel())

In [None]:
# Number of rows per encoded label.
tmp.value_counts()

In [None]:
# Same counts as a fraction of all (non-null) rows, i.e. the class balance.
tmp.value_counts() / tmp.count()

In [None]:
# Column dtypes and non-null counts of the feature frame.
df.info()