# Decision Tree Classifier



In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import time

In [2]:
# Load the two pre-split breast cancer datasets from parquet files.
# NOTE(review): the frame named `test` is read from `validation_sets/` —
# confirm this is the intended hold-out split and not a tuning set.
train =  pd.read_parquet('../../../data/model_input/train_sets/breast_cancer.parquet')
test =  pd.read_parquet('../../../data/model_input/validation_sets/breast_cancer.parquet')

In [3]:
# Split the training frame: every column except `diagnosis` is a feature,
# and `diagnosis` itself is the target.
X_train = train.drop(columns='diagnosis')
y_train = train['diagnosis']

In [4]:
# Split the test frame the same way: features in X_test, `diagnosis` in y_test.
X_test = test.drop(columns='diagnosis')
y_test = test['diagnosis']

We fit a series of decision trees, varying only their maximum depth (from 1 to 10).

In [5]:
# Accumulator for per-model results, keyed by model label (filled in below).
metrics = dict()

In [6]:
# Fit one decision tree per maximum depth (1..10) and record, for each one,
# the train/test ROC AUC and the elapsed time of fit + scoring.
for max_depth in range(1, 11):
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()

    # Fixed seed: scikit-learn permutes the features at every split, so ties
    # between equally good splits are broken randomly. Without a seed the
    # fitted tree (and therefore the AUCs) can change from run to run.
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    dt.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of dt.classes_[1].
    train_pred = dt.predict_proba(X_train)[:, 1]
    test_pred = dt.predict_proba(X_test)[:, 1]

    metrics['DT_' + str(max_depth)] = {
        'Train_AUC': roc_auc_score(y_train, train_pred),
        'Test_AUC': roc_auc_score(y_test, test_pred),
        # Covers fitting, both predict_proba calls and both AUC computations.
        'Run_Time': time.perf_counter() - start_time,
    }

# One row per model, columns in a fixed display order.
metrics_dt = pd.DataFrame.from_dict(
    metrics, orient='index', columns=['Run_Time', 'Train_AUC', 'Test_AUC']
)
# Relative change (in %) of the test AUC with respect to the train AUC;
# negative values mean the model scores worse on the test set.
metrics_dt['delta%'] = 100 * (metrics_dt.Test_AUC - metrics_dt.Train_AUC) / metrics_dt.Train_AUC
metrics_dt

Unnamed: 0,Run_Time,Train_AUC,Test_AUC,delta%
DT_1,0.015929,0.926936,0.937657,1.156541
DT_2,0.009973,0.976105,0.93703,-4.0031
DT_3,0.010003,0.990346,0.933271,-5.763125
DT_4,0.010973,0.999572,0.89317,-10.644722
DT_5,0.00994,1.0,0.89317,-10.682957
DT_6,0.010999,1.0,0.89317,-10.682957
DT_7,0.009975,1.0,0.89317,-10.682957
DT_8,0.010971,1.0,0.89317,-10.682957
DT_9,0.010939,1.0,0.89317,-10.682957
DT_10,0.009973,1.0,0.89317,-10.682957


In [7]:
# Write the per-depth metrics table to a parquet file.
# NOTE(review): assumes the target directory already exists — confirm, or
# create it before this cell runs on a fresh checkout.
metrics_dt.to_parquet('../../../data/metrics/breast_cancer/decision_tree.parquet')

These models tend to overfit. The best ones are the first two trees (maximum depths 1 and 2): their test AUC is good and their delta is well below 10% in absolute value (about 1% and 4%, respectively). In fact, the tree of depth 1 performed slightly better on the test set than on the training set.

Trees of maximum depth 4 and above are heavily overfitted (their delta is below -10%, with a train AUC of essentially 1.0). The tree of maximum depth 3 is also slightly overfitted if we set a bound of 5% on the absolute value of delta. The first tree is suspicious because its AUC is higher on the test set than on the training set. Overall, the **best model** in terms of overfitting and AUC is **DT_2**.