In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [2]:
# filename="https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv"
# df=pd.read_csv(filename)

In [3]:
# import wget
# filename="https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv"
# wget.download(filename)

<div id="about_dataset">
    <h2>About the dataset</h2>
    Imagine that you are a medical researcher compiling data for a study. You have collected data about a set of patients, all of whom suffered from the same illness. During their course of treatment, each patient responded to one of 5 medications: Drug A, Drug B, Drug C, Drug X, and Drug Y.
    <br>
    <br>
    Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. The feature sets of this dataset are Age, Sex, Blood Pressure, and Cholesterol of patients, and the target is the drug that each patient responded to.
    <br>
    <br>
    This is a sample multiclass classification problem (five possible drugs): you can use the training part of the dataset
    to build a decision tree, and then use it to predict the class of an unknown patient, or to prescribe a drug to a new patient.
</div>


In [2]:
# Load the local copy of the drug dataset and preview its first five rows.
df = pd.read_csv("drug200.csv")
df.head(5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape
# Optional follow-up check of the category balance: df["Cholesterol"].value_counts()

(200, 6)

<h2>Preprocessing</h2>

In [3]:
# Every column except the last one is used as a predictor.
Xi = df.columns[:-1].tolist()
# Pull the predictors out as a NumPy array so they can be edited column by column.
X = df[Xi].to_numpy()
X[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

As you may figure out, some features in this dataset are categorical, such as __Sex__ or __BP__. Unfortunately, Sklearn Decision Trees do not handle categorical variables, but we can still convert these features to numerical values. One option is __pandas.get_dummies()__, which converts a categorical variable into dummy/indicator variables.
The cells below instead use __sklearn.preprocessing.LabelEncoder__, which maps each category to an integer code.

In [4]:
from sklearn import preprocessing

# Encode the Sex and Cholesterol columns of X as integer codes.
#
# Each encoder reads the raw strings from df rather than from X itself.
# Transforming X[:, i] in place from its own contents made this cell fail with
# "previously unseen labels" when it was run a second time, because the column
# already held integer codes by then. Reading from df keeps the cell re-runnable.
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['F', 'M'])
X[:, 1] = le_sex.transform(df["Sex"].values)  # column 1 of X holds Sex

le_Chol = preprocessing.LabelEncoder()
le_Chol.fit(["NORMAL", "HIGH"])
X[:, 3] = le_Chol.transform(df["Cholesterol"].values)  # column 3 of X holds Cholesterol

X[0:5]

array([[23, 0, 'HIGH', 0, 25.355],
       [47, 1, 'LOW', 0, 13.093],
       [47, 1, 'LOW', 0, 10.113999999999999],
       [28, 0, 'NORMAL', 0, 7.797999999999999],
       [61, 0, 'LOW', 0, 18.043]], dtype=object)

In [5]:
# Encode the blood-pressure column of X as integer codes.
#
# The raw strings are read from df rather than from X[:, 2]: re-encoding X in
# place from its own contents raised "previously unseen labels" on a second run,
# because the column already held integers. Reading from df keeps the cell re-runnable.
# LabelEncoder sorts its classes, so the codes are HIGH -> 0, LOW -> 1, NORMAL -> 2.
le_BP = preprocessing.LabelEncoder().fit(["LOW", "NORMAL", "HIGH"])
X[:, 2] = le_BP.transform(df["BP"].values)  # column 2 of X holds BP
X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

Now we can fill the target variable.

In [8]:
# The drug each patient responded to is the label the tree will predict.
Y = df.loc[:, "Drug"]
Y.head(5)

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

<hr>

<div id="setting_up_tree">
    <h2>Setting up the Decision Tree</h2>
    We will be using <b>train/test split</b> on our <b>decision tree</b>. Let's import <b>train_test_split</b> from <b>sklearn.model_selection</b>.
</div>

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=3,
)

# Report the dimensions of each partition, features first and labels second.
for partition in (X_train, X_test, Y_train, Y_test):
    print(partition.shape)

(140, 5)
(60, 5)
(140,)
(60,)


<hr>

<div id="modeling">
    <h2>Modeling</h2>
    We will first create an instance of the <b>DecisionTreeClassifier</b> called <b>drugTree</b>.<br>
    Inside of the classifier, specify <i> criterion="entropy" </i> so we can see the information gain of each node.
</div>

In [22]:
# criterion="entropy": splits are chosen by information gain.
# max_depth=4: caps how deep the tree may grow. With max_depth=None, nodes are
#   expanded until all leaves are pure or until all leaves contain fewer than
#   min_samples_split samples (the minimum needed to split an internal node).
# random_state=3: scikit-learn permutes the candidate features at every split, so
#   equally good splits could otherwise be chosen differently between runs;
#   pinning the seed makes the fitted tree reproducible.
drugTree = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=3)
drugTree  # echo the estimator and the parameters it was configured with

DecisionTreeClassifier(criterion='entropy', max_depth=4)

Next, we will fit the model with the training feature matrix <b>X_train</b> and the training response vector <b>Y_train</b>.

In [11]:
# Train the tree on the training partition; fit() returns the estimator, which is echoed.
drugTree.fit(X=X_train, y=Y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=10)

<hr>

<div id="prediction">
    <h2>Prediction</h2>
    Let's make some <b>predictions</b> on the testing dataset and store it into a variable called <b>predTree</b>.
</div>

In [12]:
# Predict a drug label for every held-out patient.
predTree = drugTree.predict(X=X_test)

You can print out <b>predTree</b> and <b>Y_test</b> if you want to visually compare the predictions to the actual values.

In [13]:
# Show the first five predictions followed by the first five true labels.
print(predTree[:5])
print(Y_test.iloc[:5])


['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


<hr>

<div id="evaluation">
    <h2>Evaluation</h2>
    Next, let's import <b>metrics</b> from sklearn and check the accuracy of our model.
</div>

In [14]:
from sklearn import metrics
import matplotlib.pyplot as plt

# Fraction of held-out patients whose predicted drug equals the true drug.
tree_accuracy = metrics.accuracy_score(Y_test, predTree)
print("DecisionTrees's Accuracy: ", tree_accuracy)

DecisionTrees's Accuracy:  0.9833333333333333


__Accuracy classification score__ computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.  

In multilabel classification, the function returns the subset accuracy. If the entire set of predicted labels for a sample strictly matches the true set of labels, then the subset accuracy is 1.0; and if none is matched it is 0.0.


## Practice 
Can you calculate the accuracy score without sklearn ?

In [15]:
# Accuracy by hand: count the positions where prediction and truth agree,
# then divide by the number of predictions.
count = sum(
    1
    for predicted, actual in zip(predTree, Y_test)
    if predicted == actual
)
print("Accuracy score without accuracy_score function", count/predTree.shape[0])

Accuracy score without accuracy_score function 0.9833333333333333


<hr>

<div id="visualization">
    <h2>Visualization</h2>
    Let's visualize the tree.
</div>

In [17]:
# !pip install six

In [18]:
# !pip install pydotplus
# !pip install graphviz ..

In [19]:
# from sklearn.externals.six import StringIO  #in python 2
from six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
# %matplotlib inline 

In [20]:
# !conda install python-graphviz


In [6]:
# dot_data = StringIO()
# filename = "drugtree.png"
# featureNames = df.columns[0:5]
# targetNames = df["Drug"].unique().tolist()
# out=tree.export_graphviz(drugTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(Y_train), filled=True,  special_characters=True,rotate=False)  
# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# graph.write_png(filename)
# img = mpimg.imread(filename)
# plt.figure(figsize=(100, 200))
# plt.imshow(img,interpolation='nearest')

In [None]:
# Render the fitted tree with Graphviz, save it as a PNG, and show the image.
dot_data = StringIO()
filename = "practiceDrug.png"

# Pass plain lists of strings: that form is accepted by both the older
# ("list of str") and newer ("array-like of str") export_graphviz signatures,
# unlike a pandas Index or a NumPy array.
featureNames = list(df.columns[0:5])
# Class names must be in ascending order to line up with the classifier's classes,
# so they come from np.unique (sorted), not from order of appearance in the data.
classNames = np.unique(Y_train).tolist()

# With out_file given, the DOT source is written into dot_data.
tree.export_graphviz(drugTree,
                     out_file=dot_data,
                     feature_names=featureNames,
                     class_names=classNames,
                     filled=True,
                     special_characters=False,
                     rotate=False)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png(filename)

img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))  # NOTE(review): very large canvas; reduce if rendering is slow
plt.imshow(img, interpolation="nearest");  # trailing ';' hides the AxesImage repr