## Importing Libraries

In [14]:
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline

## Importing Dataset

In [15]:
dataset = pd.read_csv('drug200.csv')

In [16]:
dataset.head(10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
5,22,F,NORMAL,HIGH,8.607,drugX
6,49,F,NORMAL,HIGH,16.275,drugY
7,41,M,LOW,HIGH,11.037,drugC
8,60,M,NORMAL,HIGH,15.171,drugY
9,43,M,LOW,NORMAL,19.368,drugY


In [17]:
dataset.shape

(200, 6)

## Extracting Independent Variables

In [18]:
X = dataset[['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K']].values
X[0:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.113999999999999],
       [28, 'F', 'NORMAL', 'HIGH', 7.797999999999999],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

## Data Preprocessing

In [19]:
from sklearn.preprocessing import LabelEncoder


In [21]:
encoded_sex = LabelEncoder()
encoded_sex.fit(['F', 'M'])  # here we are actually performing encoding
X[:, 1] = encoded_sex.transform(X[:, 1]) # here we are applying the calculated logic

# similarly doing for other categorical variables

encoded_bp = LabelEncoder() #here we creating a instance of label encoder
encoded_bp.fit(['LOW', 'NORMAL', 'HIGH'])
X[:, 2] = encoded_bp.transform(X[:, 2])

encoded_cholestrol = LabelEncoder()
encoded_cholestrol.fit(['NORMAL', 'HIGH'])
X[:, 3] = encoded_bp.transform(X[:, 3])

In [24]:
X[0:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.113999999999999],
       [28, 0, 2, 0, 7.797999999999999],
       [61, 0, 1, 0, 18.043]], dtype=object)

## Extracting Dependent Variable

In [28]:
y = dataset['Drug']
y[0:5] # to fetch top 5 rows

0    drugY
1    drugC
2    drugC
3    drugX
4    drugY
Name: Drug, dtype: object

## Splitting the Dataset into Training And Test Sets

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=3)

## Implementing The Model

In [31]:
from sklearn.tree import DecisionTreeClassifier #importing the model

In [32]:
drug_tree = DecisionTreeClassifier() #creating the instance of the model

drug_tree.fit(X_train, y_train) #fitting the model with the training data

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [33]:
y_pred = drug_tree.predict(X_test) #predicting the output for the input test data


## Evaluating Our Model

In [35]:
from sklearn.metrics import accuracy_score
accuracy_of_model = accuracy_score(y_test, y_pred)
print('Accuracy of Model: {0}'.format(accuracy_of_model*100))

Accuracy of Model: 98.33333333333333
