In [None]:
'''
This Jupyter Notebook displays some examples to practically implement
Decision Trees. 
@Reference: 
http://scikit-learn.org/stable/modules/tree.html
'''

In [98]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

In [66]:
# Load iris dataset
iris = load_iris();

# Create Dataframe for iris features
df_features = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Create Dataframe for iris target
df_target = pd.DataFrame(data=iris.target, columns=['target'])

In [25]:
df_features.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [36]:
df_target.head()

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0


In [76]:
# 0 => 'setosa', 1 => 'versicolor', 2 => 'virginica'
df = pd.concat([df_features, df_target], axis=1)

# Shuffle the dataset
df = shuffle(df)

df_target = df['target']
df_features = df[iris.feature_names]

In [78]:
df.count()

sepal length (cm)    150
sepal width (cm)     150
petal length (cm)    150
petal width (cm)     150
target               150
dtype: int64

In [77]:
# Separate training and validation examples
# 100/150 => training and 50/150 => validation
train_feature_data = df_features.iloc[0:100]
train_target_data = df_target.iloc[0:100]
test_feature_data = df_features.iloc[100:150]
test_target_data = df_target.iloc[100:150]

In [79]:
# Create classifier and fit the training data in it
clf = tree.DecisionTreeClassifier()
clf.fit(train_feature_data, train_target_data)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [80]:
# Predict
predict_target = clf.predict(test_feature_data)

In [99]:
# Calculate accuracy score
accuracy_score(test_target_data, predict_target)

0.95999999999999996

In [101]:
# returns target value for the given
# target
def getTargetValues(target):
    if target == 0:
        return 'setosa'
    elif target == 1:
        return 'versicolor'
    else:
        return 'virginica'

In [102]:
df_predict = pd.DataFrame(predict_target)

In [104]:
# Get target values for each target
df_predict = df_predict.applymap(getTargetValues)

In [106]:
predict_target

array([2, 1, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 1, 0, 2, 2, 1, 0, 2, 2, 0, 1, 2,
       0, 2, 1, 2, 2, 0, 1, 0, 1, 1, 0, 2, 2, 2, 0, 1, 1, 2, 1, 1, 0, 0, 2,
       0, 0, 1, 1])

In [107]:
df_predict.head()

Unnamed: 0,0
0,virginica
1,versicolor
2,versicolor
3,virginica
4,setosa
