In [27]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [28]:
data = pd.read_csv('wdbc.data', header=None)

In [29]:
# Shuffle the DataFrame rows 
data = data.sample(frac = 1) 

In [30]:
# Pre-process the data 
X = data.drop(data.columns[1], axis=1)
X = np.array(X)
X = X.astype(float) 
Y = data[1]
# Convert benign and malignant characters to 0's and 1's from B's and M's
# for i, val in enumerate(Y):
#     if val == "B":
#         Y[i] = 0
#     else:
#         Y[i] = 1
# Scikit learn label encoder can also be used for this purpose
le_Y = LabelEncoder()
Y = le_Y.fit_transform(Y)
# Converting into a numpy array 
Y = np.array(Y)
# Cast a numpy array to a specified float dtype, else unknown error will be thrown
Y = Y.astype(float) 

In [31]:
# Custom Method to split data into 70% Train and 30% Test data
# def split_data(df, X, Y):
#     arr_rand = np.random.rand(df.shape[0])
#     split = arr_rand < np.percentile(arr_rand, 80)
    
#     X_train = X[split]
#     Y_train = Y[split]
#     X_test =  X[~split]
#     Y_test = Y[~split]

#     return X_train, Y_train, X_test, Y_test

# We can also split using a custom function
# X_train, Y_train, X_test, Y_test = split_data(data, X, Y) 

In [32]:
# Using skikit learn train_test_split function
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [33]:
# Fitting a decision tree classifier
clf_dt = DecisionTreeClassifier(random_state = 0, criterion="entropy")
clf_dt.fit(X_train, Y_train)
# Predicting the y-values
Y_prediction_dt = clf_dt.predict(X_test)

In [34]:
# Getting the accuracy of the model
print('Accuracy of Decision Tree: ', accuracy_score(Y_test, Y_prediction_dt) * 100, '%')
# 0.92

# # Getting the confusion matrix
print('Confusion Matrix for Decision Tree:\n', confusion_matrix(Y_test, Y_prediction_dt))

# # Precision score
print('Precision Score for Decision Tree: ', precision_score(Y_test, Y_prediction_dt))

# # Recall score
print('Recall Score for Decision Tree: ', recall_score(Y_test, Y_prediction_dt))

# F-1 Score
print('F-1 Score: ', f1_score(Y_test, Y_prediction_dt))

# TRUE POSITIVE

Accuracy of Decision Tree:  93.85964912280701 %
Confusion Matrix for Decision Tree:
 [[60  1]
 [ 6 47]]
Precision Score for Decision Tree:  0.9791666666666666
Recall Score for Decision Tree:  0.8867924528301887
F-1 Score:  0.9306930693069307


In [35]:
# Random forest classifier for 200 trees
clf_rf = RandomForestClassifier(n_estimators = 200, random_state = 0, criterion="entropy")
clf_rf.fit(X_train, Y_train)
# Predicting the y-values
Y_prediction_rf = clf_rf.predict(X_test)

In [36]:
# Getting the accuracy of the model
print('Accuracy of Random Forest: ', accuracy_score(Y_test, Y_prediction_rf) * 100, '%')
# 0.97

# # Getting the confusion matrix
print('Confusion Matrix for Random Forest:\n', confusion_matrix(Y_test, Y_prediction_rf))

# # Precision score
print('Precision Score for Random Forest: ', precision_score(Y_test, Y_prediction_rf))

# # Recall score
print('Recall Score for Random Forest: ', recall_score(Y_test, Y_prediction_rf))

# F-1 score
print('F-1 score: ', f1_score(Y_test, Y_prediction_rf))

# # TRUE POSITIVE

Accuracy of Random Forest:  96.49122807017544 %
Confusion Matrix for Random Forest:
 [[61  0]
 [ 4 49]]
Precision Score for Random Forest:  1.0
Recall Score for Random Forest:  0.9245283018867925
F-1 score:  0.9607843137254902
