In [1]:
#modules for data manipulation
import pandas as pd

In [2]:
attribute_num = 10 #number of attributes

# Read the CSV and discard the 'id' column in a single chained expression.
df = pd.read_csv('data.csv', delimiter=",", low_memory=False).drop(columns='id')

# With 'id' removed, column 0 is the diagnosis and the next attribute_num
# columns are the mean attribute data.
y = df.iloc[:, :1] #single-column frame holding the diagnosis
X = df.iloc[:, 1:1 + attribute_num] #the mean attribute columns

# Column labels of X, in order, as a plain Python list.
attribute_list = X.columns.tolist()
print('Here are the attributes:', attribute_list)

Here are the attributes: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']


In [3]:
#modules for generating the tree
from sklearn import tree, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics

In [4]:
# Standardize the attribute columns; fit_transform fits the scaler on X and
# returns the scaled values in one call.
# NOTE(review): the scaler is fitted on all of X before the train/test split, and
# df_train is not referenced by any later cell visible here - confirm it is needed.
sc = StandardScaler()
df_train = sc.fit_transform(X)

In [5]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# NumPy copies of the feature frames, used for fitting and predicting.
X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()

#process y_train and y_test for visualization such as convert diagnosis from B/M to 0/1
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y_train.diagnosis) #the encoder is fitted on the training labels only
# Build new frames with assign() instead of writing into the frames returned by
# train_test_split: column assignment on those frames can raise pandas'
# SettingWithCopyWarning. The resulting values are the same.
y_train = y_train.assign(diagnosis=label_encoder.transform(y_train.diagnosis))
y_test = y_test.assign(diagnosis=label_encoder.transform(y_test.diagnosis))

# Entropy-based decision tree limited to 6 leaves; random_state fixes tie-breaking.
clf = tree.DecisionTreeClassifier(criterion = "entropy", max_leaf_nodes=6, random_state = 0)
clf.fit(X_train_array, y_train)

DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=6, random_state=0)

In [6]:
# Predict on the training features once and reuse the result for every metric
# below, instead of re-running the classifier for each report.
y_train_pred = clf.predict(X_train_array)

# Accuracy is shown as a percentage rounded to two decimals.
print(f"Train - Accuracy: {round(metrics.accuracy_score(y_train, y_train_pred)*100,2)}%.")
print()
print("Train - Confusion matrix:")
print(metrics.confusion_matrix(y_train, y_train_pred))
print()
print("Train - Classification report:")
print(metrics.classification_report(y_train, y_train_pred))

Train - Accuracy: 93.72%.

Train - Confusing matrix:
[[230  19]
 [  6 143]]

Train - Classification report:
              precision    recall  f1-score   support

           0       0.97      0.92      0.95       249
           1       0.88      0.96      0.92       149

    accuracy                           0.94       398
   macro avg       0.93      0.94      0.93       398
weighted avg       0.94      0.94      0.94       398



In [7]:
# Predict on the held-out features once and reuse the result for every metric
# below, instead of re-running the classifier for each report.
y_test_pred = clf.predict(X_test_array)

# Accuracy is shown as a percentage rounded to two decimals.
print(f"Test - Accuracy: {round(metrics.accuracy_score(y_test, y_test_pred)*100,2)}%.")
print()
print("Test - Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_test_pred))
print()
print("Test - Classification report:")
print(metrics.classification_report(y_test, y_test_pred))

Test - Accuracy: 88.89%.

Test - Confusing matrix:
[[94 14]
 [ 5 58]]

Test - Classification report:
              precision    recall  f1-score   support

           0       0.95      0.87      0.91       108
           1       0.81      0.92      0.86        63

    accuracy                           0.89       171
   macro avg       0.88      0.90      0.88       171
weighted avg       0.90      0.89      0.89       171



In [8]:
import graphviz #load tree visualization package

# Export the fitted tree as DOT text; with out_file=None the text is returned
# rather than written to a file.
output_data2 = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=attribute_list,
    class_names=['Benign', 'Malignant'],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Wrap the DOT text in a graphviz Source object.
graph = graphviz.Source(output_data2)
# Render to PDF and open the result in a viewer (view=True).
graph.render("decision_tree_graphivz", view=True)

'decision_tree_graphivz.pdf'

In [9]:
from dtreeviz.trees import dtreeviz #import only the visualization function used below

# Build the dtreeviz visualization of the fitted tree from the training
# features and the encoded diagnosis labels.
viz = dtreeviz(clf, X_train, y_train.diagnosis,
                target_name = "State",
                feature_names = X_train.columns, histtype='barstacked',
                class_names = ['Benign', 'Malignant'],
                title = "Decision Tree - Cancer data set")

viz.view() #open tree in new tab
viz.save("decision_tree_dtreeviz.svg") #SVG output