# Decision Tree Classifier



In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import time

In [2]:
# Read the pre-split breast-cancer training and validation sets from parquet.
train, test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../../../data/model_input/train_sets/breast_cancer.parquet',
        '../../../data/model_input/validation_sets/breast_cancer.parquet',
    )
)

In [3]:
# Training set: every column except `diagnosis` is a feature; `diagnosis` is the target.
X_train = train.drop(columns=['diagnosis'])
y_train = train['diagnosis']

In [4]:
# Validation set: same feature/target separation as for the training set.
X_test = test.drop(columns=['diagnosis'])
y_test = test['diagnosis']

We fit a series of decision trees, varying only their maximum depth.

In [5]:
# Accumulator for per-model results, keyed by model name.
metrics = dict()

In [6]:
# Fixed seed so the sweep is reproducible: DecisionTreeClassifier randomly
# permutes the features at every split, so equally good splits can be chosen
# differently from one run to the next unless random_state is set.
RANDOM_STATE = 42
MAX_DEPTHS = range(1, 11)  # maximum depths 1..10

for max_depth in MAX_DEPTHS:
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed intervals (time.time() can jump and may be coarse).
    start_time = time.perf_counter()
    dt = DecisionTreeClassifier(max_depth=max_depth, random_state=RANDOM_STATE)
    dt.fit(X_train, y_train)

    # Score for the second class listed in dt.classes_ (column index 1).
    train_pred = dt.predict_proba(X_train)[:, 1]
    test_pred = dt.predict_proba(X_test)[:, 1]

    metrics['DT_' + str(max_depth)] = {
        # Gini coefficient derived from ROC AUC: Gini = 2 * AUC - 1.
        'Train_Gini': 2 * roc_auc_score(y_train, train_pred) - 1,
        'Test_Gini': 2 * roc_auc_score(y_test, test_pred) - 1,
        # Evaluated last, so it covers fitting, both predict_proba calls
        # and both AUC computations.
        'Run_Time': time.perf_counter() - start_time,
    }

# One row per model; `columns` fixes the column order of the resulting table.
metrics_dt = pd.DataFrame.from_dict(
    metrics, orient='index', columns=['Run_Time', 'Train_Gini', 'Test_Gini']
)
# Relative change (in %) of the test Gini with respect to the train Gini;
# negative values mean the model scores lower on the test set.
metrics_dt['delta%'] = (
    100 * (metrics_dt.Test_Gini - metrics_dt.Train_Gini) / metrics_dt.Train_Gini
)
metrics_dt

Unnamed: 0,Run_Time,Train_Gini,Test_Gini,delta%
DT_1,0.022154,0.853872,0.875313,2.511007
DT_2,0.009974,0.952209,0.87406,-8.207114
DT_3,0.010971,0.980691,0.866541,-11.639721
DT_4,0.011967,0.999144,0.786341,-21.298562
DT_5,0.010971,1.0,0.786341,-21.365915
DT_6,0.012965,1.0,0.786341,-21.365915
DT_7,0.019947,1.0,0.786341,-21.365915
DT_8,0.01895,1.0,0.786341,-21.365915
DT_9,0.019946,1.0,0.786341,-21.365915
DT_10,0.015958,1.0,0.786341,-21.365915


In [7]:
# Persist the metrics table (index = model name) as parquet.
metrics_output_path = '../../../data/metrics/breast_cancer/decision_tree.parquet'
metrics_dt.to_parquet(metrics_output_path)

These models tend to overfit. The best ones are the first two trees (maximum depths 1 and 2): their test performance, measured by the Gini coefficient, is good, and the absolute value of their delta is below 10%. In fact, the tree of depth 1 performed better on the test set than on the training set.