In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt
%matplotlib inline

### Reading the dataset

In [None]:
# Loading the raw dataset from the CSV file (comma is the default separator)
cancer_df = pd.read_csv('./data/cancer.csv')

In [None]:
# Deleting unused column
# `drop` builds a new frame instead of mutating in place, and `errors = 'ignore'`
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises a KeyError (which `del cancer_df[...]` did).
cancer_df = cancer_df.drop(columns = ['Unnamed: 32'], errors = 'ignore')

In [None]:
# Inspecting the dataset dimensions as (rows, columns)
cancer_df.shape

In [None]:
# Checking the data type pandas inferred for each column
cancer_df.dtypes

In [None]:
# Previewing the first rows of the dataset (`head()` shows 5 rows by default)
cancer_df.head()

### Splitting train and test datasets

In [None]:
# Feature names = every column from the third one onwards.
# NOTE(review): this assumes the first two columns are non-feature columns
# (presumably an identifier and the `diagnosis` target) - confirm against the CSV.
features = list(cancer_df.columns[2:])

In [None]:
# Building the feature matrix: all rows, only the selected feature columns
X = cancer_df.loc[:, features]

In [None]:
# Creating target: encode the diagnosis labels as integers (B -> 0, M -> 1).
# `map` is used instead of `replace` because replacing strings with ints relies on
# implicit downcasting, which emits a FutureWarning in recent pandas versions.
Y = cancer_df['diagnosis'].map({'B': 0, 'M': 1})

# `map` turns any label missing from the dictionary into NaN; fail fast here
# rather than letting an unexpected label reach the model.
assert Y.notna().all(), 'Unexpected label found in the diagnosis column'

In [None]:
# Class balance of the full target (relative frequencies); Y is already a Series
Y.value_counts(normalize = True)

In [None]:
# Splitting feature matrix for training (70%) and test (30%)
# `stratify = Y` keeps the class proportions equal in both splits.
# `random_state` fixes the shuffle so the split (and every metric computed below)
# is reproducible across Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.3,
    stratify = Y,
    random_state = 42,
)

In [None]:
# Class balance of the training target - should match the full dataset (stratified split)
Y_train.value_counts(normalize = True)

In [None]:
# Class balance of the test target - should match the full dataset (stratified split)
Y_test.value_counts(normalize = True)

### Training the model

In [None]:
# Initializing the model
# The tree builder permutes features randomly when searching for splits, so ties
# between equally good splits can be broken differently on each run; fixing
# `random_state` makes the fitted tree reproducible.
model = DecisionTreeClassifier(random_state = 42)

In [None]:
# Training the model using training dataset
# (`fit` returns the estimator itself, so the cell output shows the fitted model)
model.fit(X_train, Y_train)

In [None]:
# Drawing the fitted decision tree on a large canvas so the nodes stay readable
tree_fig, tree_ax = plt.subplots(figsize = (30, 20))
plot_tree(
    model,
    feature_names = features,
    class_names = ['B', 'M'],
    ax = tree_ax,
)
plt.show()

### Evaluating the model

In [None]:
# Predicting class labels for the held-out test dataset
# (labels use the target encoding: 0 = B, 1 = M)
predictions = model.predict(X_test)

In [None]:
# Plotting confusion matrix
# `display_labels` shows the original diagnosis classes on the axes instead of the
# encoded 0/1 values (sorted label order is 0 -> B, 1 -> M), consistent with the
# class names used in the tree plot.
ConfusionMatrixDisplay.from_predictions(Y_test, predictions, display_labels = ['B', 'M'])
# Render the figure explicitly so the cell does not also print the display object's repr
plt.show()

In [None]:
# Reporting the metrics derived from the confusion matrix.
# Each entry pairs the printed label with the sklearn scorer that computes it;
# with the default settings the positive class is 1 (M).
metric_table = (
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1:', f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(Y_test, predictions))