# Decision tree

In [None]:
# Fit a decision tree on a two-sample toy problem (one sample per class).
from sklearn import tree

X = [[0, 0], [1, 2]]
y = [0, 1]
dtc = tree.DecisionTreeClassifier().fit(X, y)

In [None]:
# Predicted class label for the single query point (2, 2).
dtc.predict([[2,2]])

In [None]:
# Per-class probability estimates for the query point (2, 2).
dtc.predict_proba([[2,2]])

In [None]:
# Predicted class label for a point lying between the two training samples.
dtc.predict([[0.4,1.2]])

In [None]:
# Per-class probability estimates for a point close to the class-0 sample (0, 0).
dtc.predict_proba([[0,0.2]])

## Iris Dataset

In [None]:
from sklearn.datasets import load_iris
from sklearn import tree
# Load the iris dataset; later cells read its .data, .target, .feature_names
# and .target_names attributes.
iris = load_iris()

In [None]:
# Keep only the feature columns from index 2 onward (names: iris.feature_names[2:]).
# NOTE(review): presumably the two petal measurements — confirm against iris.feature_names.
X = iris.data[:,2:]

In [None]:
# Class labels used as the training target.
y = iris.target

In [None]:
# Seeded classifier so that repeated fits give the same tree.
dtc = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Train on the sliced iris features and their labels.
dtc = dtc.fit(X,y)

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Write the fitted tree to "tree.dot" in Graphviz DOT format.
# The tree was trained on iris.data[:, 2:], so the node labels must use the
# matching slice of feature names (index 2 onward), not the first two.
export_graphviz(dtc, out_file="tree.dot",
                feature_names=iris.feature_names[2:],
                class_names=iris.target_names,
                rounded=True,
                filled=True)

Render the exported tree as a PNG by running this command in a shell:
`dot -Tpng tree.dot -o tree.png`

<img src="tree.png">

## Graphviz

In [None]:
import graphviz

In [None]:
# Export the fitted tree as DOT source held in memory (out_file=None)
# instead of writing it to disk.
export_options = dict(
    out_file=None,
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True,
)
dot_data = tree.export_graphviz(dtc, **export_options)

In [None]:
# Wrap the DOT source so the notebook can render it inline.
graph = graphviz.Source(dot_data)

In [None]:
graph

## Visualize the Decision Boundary

In [None]:
import numpy as np, seaborn as sns, matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline

In [None]:
# Load seaborn's copy of iris, encode the species names as integers in a
# helper column, and pick the two petal columns as features.
df = sns.load_dataset('iris')
species_to_num = {
    name: code
    for code, name in enumerate(('setosa', 'versicolor', 'virginica'))
}
df['tmp'] = df['species'].map(species_to_num)
col = ['petal_length', 'petal_width']
X = df[col]
y = df['tmp']

In [None]:
# Fit an unseeded tree on the two petal features.
dtc = tree.DecisionTreeClassifier().fit(X, y)

In [None]:
# Plot the tree's decision regions over the plane of the two features in X,
# with the training points drawn on top.
h = 0.02  # mesh step along both axes
features = X.values
# Each axis range comes from its own feature column (padded by 1 on both
# sides); the class labels in y only colour the points, they are not an axis.
x_min, x_max = features[:, 0].min() - 1, features[:, 0].max() + 1
y_min, y_max = features[:, 1].min() - 1, features[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Classify every mesh node, then restore the grid shape for contourf.
z = dtc.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
fig = plt.figure(figsize=(12, 8))
ax = plt.contourf(xx, yy, z, cmap='afmhot', alpha=0.3)
plt.scatter(features[:, 0], features[:, 1], c=y, s=80, alpha=0.9, edgecolors='g')

## Gini and Entropy

In [None]:
def gini(p):
    """Return the two-class Gini impurity ``2 * p * (1 - p)``.

    ``p`` is the probability of one class; scalars and NumPy arrays both work.
    """
    complement = 1 - p
    return 2 * p * complement
def entropy(p):
    """Return the binary entropy (in bits) of a class probability ``p``.

    Computes ``-p*log2(p) - (1-p)*log2(1-p)``. At the end points ``p == 0``
    and ``p == 1`` the ``0 * log2(0)`` term is taken as 0 (its limit), so
    the result is 0 rather than NaN. Accepts scalars or array-likes; a
    scalar input yields a NumPy float, an array input an array of the same
    shape. Values outside [0, 1] still produce NaN.
    """
    p = np.asarray(p, dtype=float)
    q = 1.0 - p
    # log2(0) is -inf and 0 * -inf is NaN; both are replaced by 0 below, so
    # the floating-point warnings raised on the way are noise.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_term = np.where(p == 0, 0.0, p * np.log2(p))
        q_term = np.where(q == 0, 0.0, q * np.log2(q))
    return -p_term - q_term
def error(p):
    """Return the two-class misclassification error ``1 - max(p, 1 - p)``.

    Uses the elementwise ``np.maximum`` so an array of probabilities yields
    an array of errors; scalar inputs behave as before.
    """
    return 1 - np.maximum(p, 1 - p)

In [None]:
# Sample p over [0, 1) in steps of 0.01 and evaluate the impurity measures.
x = np.arange(0, 1, 0.01)
# p == 0 is skipped (stored as None) so the log2(0) end point is never evaluated.
ent = [entropy(p) if p != 0 else None for p in x]
# Entropy scaled by 0.5. The explicit None check keeps a genuine 0.0 entropy
# value, which a bare truthiness test would also turn into a gap.
sc_ent = [e * 0.5 if e is not None else None for e in ent]
err = [error(p) for p in x]

In [None]:
# Plot the impurity measures against p on a single axes.
figure = plt.figure(figsize=(10, 8))
ax = plt.subplot(111)
curves = zip([ent, sc_ent, gini(x), err],
             ['Entropy', 'Entropy (scaled)', 'Gini Impurity', 'Error'],
             ['-', '-', '--', '-'],
             ['black', 'lightgray', 'red', 'green'])
for values, lab, ls, c in curves:
    ax.plot(x, values, label=lab, linestyle=ls, lw=2, color=c)
# Each curve carries a label, so draw the legend that displays them.
ax.legend()
# Dashed reference lines at impurity 0.5 and 1.0.
ax.axhline(y=0.5, linewidth=1, color='k', linestyle='--')
ax.axhline(y=1.0, linewidth=1, color='k', linestyle='--')
plt.ylim([0, 1.1])
plt.xlabel('p(i=1)')
plt.ylabel('impurity index')
plt.show()
    

## Overfitting

In [None]:
# Seeded tree with no depth or leaf-size limits.
dtc = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Train on the X and y currently in scope.
dtc = dtc.fit(X,y)

In [None]:
# Export the refitted tree as in-memory DOT source (out_file=None).
dot_kwargs = {
    'out_file': None,
    'feature_names': iris.feature_names[2:],
    'class_names': iris.target_names,
    'rounded': True,
    'filled': True,
}
dot_data = tree.export_graphviz(dtc, **dot_kwargs)

In [None]:
# Wrap the DOT source so the notebook can render the tree inline.
graph = graphviz.Source(dot_data)

In [None]:
graph

## Modelling End-to-End with Decision Tree

In [None]:
from sklearn.datasets import make_moons

In [None]:
# Generate a seeded 1000-sample "moons" dataset with noise level 0.5.
X_data, y_data = make_moons(n_samples=1000, noise=0.5, random_state=42)

In [None]:
# Baseline: seeded tree with default settings.
dtc1 = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Regularised variant: seeded tree with min_samples_leaf=10.
dtc2 = tree.DecisionTreeClassifier(min_samples_leaf=10,random_state=42)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Seeded train/test split with test_size=0.1.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid handed to GridSearchCV: every combination of the
# values below is evaluated.
params = {
    'max_leaf_nodes': list(range(2, 50)),
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': list(range(5, 21)),
}

In [None]:
# Grid search over `params` with a seeded tree as the base estimator.
# n_jobs=-1 requests all available cores; verbose=1 prints progress.
grid_search_cv = GridSearchCV(tree.DecisionTreeClassifier(random_state=42), params, n_jobs=-1,verbose=1)

In [None]:
# Run the search on the training split only; the test split stays unseen.
grid_search_cv.fit(X_train,y_train)

In [None]:
grid_search_cv.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Test-set accuracy of the grid-search result.
# NOTE(review): relies on GridSearchCV's default refit so predict() uses the best estimator — confirm.
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_test,y_pred)

In [None]:
# Test-set accuracy of the default tree, for comparison.
dtc1.fit(X_train, y_train)
y_pred = dtc1.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Test-set accuracy of the min_samples_leaf=10 tree, for comparison.
dtc2.fit(X_train, y_train)
y_pred = dtc2.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
dtc1.get_params()

## Project HR

Download the sample HR data set `WA_Fn-UseC_-HR-Employee-Attrition.xlsx` from https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/ and save it as a CSV file; the notebook reads it from `../WA_Fn-UseC_-HR-Employee-Attrition.csv`.


In [None]:
import pandas as pd

In [None]:
# Read the HR attrition CSV from the parent directory and preview the first rows.
df = pd.read_csv("../WA_Fn-UseC_-HR-Employee-Attrition.csv")
df.head()

In [None]:
df.shape

In [None]:
# Remove, in place, four columns that are not used as model inputs.
for unused_col in ('EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'):
    df.pop(unused_col)

In [None]:
df.columns

In [None]:
# Take the target column, then remove it from the feature table.
y=df['Attrition']
# X is another name for the same DataFrame object, not a copy ...
X = df
# ... so this pop also removes 'Attrition' from df, which later cells keep using.
X.pop('Attrition')

In [None]:
y.unique()

In [None]:
from sklearn import preprocessing
# Binarizer used to turn the Attrition labels into numeric values.
le = preprocessing.LabelBinarizer()


In [None]:
# Replace the label Series with its binarized form.
# NOTE(review): presumably a single 0/1 column of shape (n_samples, 1) — confirm with y.shape.
y=le.fit_transform(y)

In [None]:
y

In [None]:
y.shape

In [None]:
df.info()

In [None]:
# Show the object-dtype columns; these are one-hot encoded below.
df.select_dtypes(['object'])

In [None]:
def _one_hot(column):
    """Return indicator columns for df[column], prefixed with the column name."""
    return pd.get_dummies(df[column], prefix=column)


# One indicator frame per categorical column.
ind_BusinessTravel = _one_hot('BusinessTravel')
ind_Department = _one_hot('Department')
ind_EducationField = _one_hot('EducationField')
ind_Gender = _one_hot('Gender')
ind_JobRole = _one_hot('JobRole')
ind_MaritalStatus = _one_hot('MaritalStatus')
ind_OverTime = _one_hot('OverTime')

In [None]:
# Final feature table: all indicator frames followed by the int64 columns
# of df, joined column-wise.
indicator_frames = [
    ind_BusinessTravel,
    ind_Department,
    ind_EducationField,
    ind_Gender,
    ind_JobRole,
    ind_MaritalStatus,
    ind_OverTime,
]
integer_columns = df.select_dtypes(['int64'])
df1 = pd.concat(indicator_frames + [integer_columns], axis=1)

In [None]:
df1

In [None]:
df1.shape

### Decision Tree

In [None]:
# Split the encoded HR features and labels into train and test sets.
# Seeded like the earlier moons split so the metrics below are reproducible
# from run to run; the default test fraction is kept.
X_train, X_test, y_train, y_test = train_test_split(df1, y, random_state=42)

In [None]:
# Seeded tree with default settings for the HR attrition data.
dtc = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Train on the HR training split.
dtc = dtc.fit(X_train, y_train)

In [None]:
dtc

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Accuracy on the training split (data the tree has already seen).
accuracy_score(y_train,dtc.predict(X_train))

In [None]:
# Classification report on the training split.
print(classification_report(y_train,dtc.predict(X_train)))

In [None]:
# Confusion matrix on the training split.
confusion_matrix(y_train,dtc.predict(X_train))

In [None]:
# Classification report on the held-out test split.
print(classification_report(y_test,dtc.predict(X_test)))

In [None]:
# Confusion matrix on the held-out test split.
confusion_matrix(y_test,dtc.predict(X_test))