In [None]:
# Copyright 2023 The ML Notebooks Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Decision Tree Classifier


This notebook uses the Wisconsin Breast Cancer dataset to demonstrate how to build a decision tree classifier that classifies suspected cells as benign or malignant.


## Setup


In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Reaching this line means every import above succeeded; a failed import
# would have raised before anything is printed.
print("Finished importing...")

## Data Collection


### The Wisconsin Breast Cancer dataset

The dataset is available from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/).


### Get the data

Download and import the dataset using pandas.


In [None]:
# pylint: disable-next=line-too-long
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

# Base measurement names. Each one is repeated once per summary statistic
# when the full column list is built below.
features = [
    "radius",
    "texture",
    "perimeter",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave_points",
    "symmetry",
    "fractal_dimension",
]

# Column layout: two leading columns, then all "<feature>_mean" columns,
# then all "<feature>_ste" columns, then all "<feature>_largest" columns.
column_names = ["id", "diagnosis"]

for attr in ["mean", "ste", "largest"]:
    for feature in features:
        column_names.append(feature + "_" + attr)

# `names=` is passed without `header=`, so pandas treats the first line of
# the file as data rather than as a header row.
df = pd.read_csv(path, names=column_names)
df.head()

## Exploratory Data Analysis (EDA)


In [None]:
# Columns 1-5 are `diagnosis` plus the first four "_mean" measurements;
# `diagnosis` is used only to colour the points, and the diagonal shows a
# kernel density estimate per class.
sns.pairplot(
    data=df.iloc[:, 1:6],
    hue="diagnosis",
    diag_kind="kde",
)

In [None]:
# Transposed so each dataset column becomes one row of summary statistics,
# which is easier to scan with this many columns.
df.describe().T

## Data Preparation


### Clean the data

Check data types.


In [None]:
# Prints each column's dtype and non-null count, which shows at a glance
# which columns are non-numeric and whether any values are missing.
df.info()

Drop the `id` column: it is a unique identifier for each row and carries no predictive information.


In [None]:
# `errors="ignore"` keeps this cell re-runnable: `df` is rebound here, so a
# second execution would otherwise raise KeyError because `id` is already gone.
df = df.drop(columns=["id"], errors="ignore")

Check for missing/unknown values.


In [None]:
# Number of missing entries in each column (summing down the rows).
df.isna().sum(axis=0)

The `"diagnosis"` column is categorical, not numeric, so the next step is to ordinal-encode its values.


In [None]:
# The encoder works on 2-D input, so the single label column is reshaped
# to (n_rows, 1) before encoding.
diagnosis_2d = df["diagnosis"].to_numpy().reshape(-1, 1)

# NOTE(review): the category-to-number mapping is chosen by the encoder;
# inspect `enc.categories_` to confirm which class becomes 0 and which 1.
enc = OrdinalEncoder()
enc.fit(diagnosis_2d)
df["diagnosis"] = enc.transform(diagnosis_2d)

df.head()

### Split features from labels


In [None]:
# Target vector: the encoded diagnosis.
labels = df["diagnosis"]
# Design matrix: every remaining column.
features = df.drop(columns="diagnosis")

### Split the data into development and test sets

Now split the dataset into a development set and a test set. You will use the test set in the final evaluation of your models.


In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the
# partition reproducible from run to run.
(dev_features, test_features, dev_labels, test_labels) = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=0.2,
)

### Normalize the data


In [None]:
# Learn the scaling statistics from the development set only, then apply
# the same fitted transform to both sets so that no information from the
# test set leaks into preprocessing.
scaler = StandardScaler()
scaler.fit(dev_features)
dev_features = scaler.transform(dev_features)
test_features = scaler.transform(test_features)

## Build a decision tree classifier


In [None]:
# Only the seed is set (so the fitted tree is reproducible); every other
# hyperparameter is left at its scikit-learn default.
model = DecisionTreeClassifier(random_state=42)

## Train and evaluate your model


Evaluate model performance using cross-validation.


In [None]:
# 5-fold cross-validated accuracy, computed on the development set only.
# error_score="raise" surfaces any fold failure as an exception instead of
# recording a placeholder score.
scores = cross_val_score(
    estimator=model,
    X=dev_features,
    y=dev_labels,
    cv=5,
    scoring="accuracy",
    error_score="raise",
)

print("Accuracy scores:\n", scores)
print("Mean Accuracy:\n", scores.mean())

Fit a decision tree classifier on the development data.


In [None]:
# Fit on the full development set. cross_val_score works on clones of the
# estimator, so `model` itself has not been fitted before this line.
model.fit(dev_features, dev_labels)

Evaluate your model on the test data.


In [None]:
# Accuracy of the fitted tree on the held-out test set.
score = model.score(X=test_features, y=test_labels)
print("Accuracy:\n", score)

In [None]:
dev_predictions = model.predict(dev_features)
test_predictions = model.predict(test_features)

# Side-by-side panels: development (left) and test (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# One (axis, title, true labels, predicted labels) entry per panel, so the
# plotting logic is written once instead of being copy-pasted per split.
panels = [
    (ax1, "Confusion matrix: Development", dev_labels, dev_predictions),
    (ax2, "Confusion matrix: Test", test_labels, test_predictions),
]

for axis, title, true_labels, predicted_labels in panels:
    axis.set_title(title)
    cm = confusion_matrix(true_labels, predicted_labels)
    disp = ConfusionMatrixDisplay(cm)
    # The call sits inside the loop rather than as the cell's last expression,
    # so the notebook no longer prints the display object's repr.
    disp.plot(ax=axis)

## Conclusion

You have trained a simple decision tree classifier using scikit-learn.
