In [1]:
#In this lab exercise, you will learn a popular machine learning algorithm: DECISION TREE. 

#You will use this classification algorithm to build a model from historical data of patients, 
#and their response to different medications. 

#Then you use the trained decision tree to predict the class of a unknown patient, 
#or to find a proper drug for a new patient.

In [1]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [3]:
#About the dataset
#Imagine that you are a medical researcher compiling data for a study. You have collected data about a set of patients, 
#all of whom suffered from the same illness. 

#During their course of treatment, each patient responded to one of 5 medications, Drug A, Drug B, Drug c, Drug x and y.

#Part of your job is to build a model to find out which drug might be appropriate for a future patient with the same illness. 
#The feature sets of this dataset are Age, Sex, Blood Pressure, and Cholesterol of patients, 
#and the target is the drug that each patient responded to.

#It is a sample of binary classifier, and you can use the training part of the dataset to build a decision tree, 
#and then use it to predict the class of a unknown patient, or to prescribe it to a new patient.

In [None]:
#Downloading data
import wget
url='https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/drug200.csv'
wget.download(url, 'C:/Users/Maikol Cerda/Desktop/Python_exercise/12.- IBM machine learning with python/Videos/drug200.csv')

In [2]:
#Loading data
my_data = pd.read_csv('C:/Users/Maikol Cerda/Dropbox/2.-DATA SCIENCE COURSES/0.- COURSERA/1.- IBM machine learning with python/Videos/drug200.csv')

# take a look at the dataset
my_data.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [3]:
#What is the size of the data
size=len(my_data)
size

200

In [6]:
#Preprocessing
#Using my_data as the Drug.csv data read by pandas, declare the following variables:

#X as the Feature Matrix (data of my_data)
#y as the response vector (target)
#Remove the column containing the target name since it doesn't contain numeric values.

X = my_data[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [7]:
#Solving categorical data
#As you may figure out, some features in this dataset are categorical such as Sex or BP. 
#Unfortunately, Sklearn Decision Trees do not handle categorical variables. 
#But still we can convert these features to numerical values. 

#pandas.get_dummies() Convert categorical variable into dummy/indicator variables.

from sklearn import preprocessing
le_sex=preprocessing.LabelEncoder()
le_sex.fit(['F','M'])
X[:,1]=le_sex.transform(X[:,1])

le_BP=preprocessing.LabelEncoder()
le_BP.fit(['LOW','NORMAL','HIGH'])
X[:,2]=le_BP.transform(X[:,2])

le_Chol=preprocessing.LabelEncoder()
le_Chol.fit(['NORMAL','HIGH'])
X[:,3]=le_Chol.transform(X[:,3])


X[0:5]


array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [10]:
#Target variable
#Now we can fill the target variable.
y = my_data["Drug"]
y[0:5]

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

In [11]:
#SETTING UP DECISION TREE
from sklearn.model_selection import train_test_split

In [14]:
#Now train_test_split will return 4 different parameters. We will name them:
#X_trainset, X_testset, y_trainset, y_testset

#The train_test_split will need the parameters:
#X, y, test_size=0.3, and random_state=3.

#The X and y are the arrays required before the split, the test_size represents the ratio of the testing dataset, 
#and the random_state ensures that we obtain the same splits.

X_trainset,X_testset, y_trainset, y_testset= train_test_split(X,y, test_size=0.3, random_state=3)

In [15]:
X_trainset.shape

(140, 5)

In [17]:
#MODELLING
#We will first create an instance of the DecisionTreeClassifier called drugTree.
#Inside of the classifier, specify criterion="entropy" so we can see the information gain of each node.
drugTree= DecisionTreeClassifier(criterion='entropy', max_depth=4)

drugTree # it shows the default parameters

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [19]:
#Next, we will fit the data with the training feature matrix X_trainset and training response vector y_trainset
drugTree.fit(X_trainset, y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [21]:
#PREDICTION
#Let's make some predictions on the testing dataset and store it into a variable called predTree.
predTree=drugTree.predict(X_testset)

In [22]:
#You can print out predTree and y_testset if you want to visually compare the prediction to the actual values.
print (predTree [0:5])
print (y_testset [0:5])

['drugY' 'drugX' 'drugX' 'drugX' 'drugX']
40     drugY
51     drugX
139    drugX
197    drugX
170    drugX
Name: Drug, dtype: object


In [23]:
#EVALUATION
#Next, let's import metrics from sklearn and check the accuracy of our model.
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  0.9833333333333333


In [24]:
#VISUALIZATION
# Notice: You might need to uncomment and install the pydotplus and graphviz libraries 
#if you have not installed these before

# conda install -c conda-forge pydotplus
# conda install -c conda-forge python-graphviz -y

In [26]:
#Importing packages
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline


In [27]:
dot_data = StringIO()
filename = "drugtree.png"
featureNames = my_data.columns[0:5]
targetNames = my_data["Drug"].unique().tolist()
out=tree.export_graphviz(drugTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_trainset), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

InvocationException: GraphViz's executables not found