In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import roc_curve, plot_roc_curve, auc, roc_auc_score, classification_report, confusion_matrix, plot_confusion_matrix

import dtreeviz.trees as dtreeviz

import graphviz
from matplotlib import pyplot as plt
import pandas as pd
import utils
from sklearn.model_selection import KFold, StratifiedKFold



In [None]:
# Fetch the data through the project's `utils` helpers, then pass it through
# the feature-engineering step; `df` is the frame used by every cell below.
raw_data = utils.get_data()
df = utils.feature_engineering(raw_data)

In [None]:
# Hyper-parameter grid shared by every grid search in this notebook.
params = {
    'max_depth': range(3, 8),
    'criterion': ('gini', 'entropy'),
    'splitter': ('best', 'random'),
    'max_features': ('sqrt', 'log2'),
}

# The tree is stochastic (random splitter, feature subsampling), so pin the
# seed to make the cross-validated scores reproducible across runs.
model = DecisionTreeClassifier(random_state=0)

# Use `columns=` instead of a positional axis: the positional form is
# deprecated in pandas 1.3 and rejected by pandas 2.x. `drop` already returns
# a new frame, so no extra copy is needed.
X = df.drop(columns='tiene_alto_valor_adquisitivo')
y = df['tiene_alto_valor_adquisitivo']

# For each number of folds k, record the winning parameter combination and
# its mean cross-validated ROC AUC (both lists are in loop order).
best_params = []
best_scores = []
for k in range(5, 15):
    gscv = GridSearchCV(model, params, scoring='roc_auc', n_jobs=-1, cv=k)
    gscv.fit(X, y)
    best_params.append(gscv.best_params_)
    best_scores.append(gscv.best_score_)
    print(k)  # progress indicator: one line per finished search

### Veamos si la cantidad de folds (K) parece afectar a los scores

In [None]:
# Best mean cross-validated ROC AUC found by each grid search above,
# one entry per k in range(5, 15), in loop order.
best_scores

Parece que no, por lo que podemos decir que el árbol de decisión efectivamente predice con un score (ROC AUC) que ronda el 0.84.

# El árbol

Para visualizar el árbol utilizaremos el que mejor predice para K = 15, pues es el valor de K más grande y, por lo tanto, el que menos sobreestima el score.

In [None]:
# Features / target. `columns=` replaces the positional axis argument, which
# is deprecated in pandas 1.3 and rejected by pandas 2.x.
X = df.drop(columns='tiene_alto_valor_adquisitivo')
y = df['tiene_alto_valor_adquisitivo']

# Final hyper-parameter search with 15 folds.
gscv = GridSearchCV(model, params, scoring='roc_auc', n_jobs=-1, cv=15)
gscv.fit(X, y)

# GridSearchCV (refit=True by default) has already re-trained the winning
# configuration on all of (X, y). Reuse that estimator instead of fitting a
# fresh, unseeded tree: a new fit can produce a different tree, so the plot
# would not show the model that `gscv.predict` actually uses.
clf = gscv.best_estimator_

viz = dtreeviz.dtreeviz(
    clf,
    X,
    y,
    target_name='tiene_alto_valor_adquisitivo',
    feature_names=list(X.columns),
    class_names=[0, 1],
    scale=1.5,
)

display(viz)

In [None]:
# Classification report of the refit grid-search model, computed on the same
# (X, y) that the search was fitted on (not a held-out set).
y_pred = gscv.predict(X)
print(classification_report(y, y_pred))

In [None]:
# Confusion matrix of the grid-search model on (X, y), drawn on a wide figure
# with the background grid turned off.
# NOTE(review): plot_confusion_matrix is deprecated in scikit-learn 1.0 and
# removed in 1.2; switch to ConfusionMatrixDisplay.from_estimator on upgrade.
fig, ax = plt.subplots(figsize=(15, 7))
ax.grid(False)
plot_confusion_matrix(gscv, X, y, ax=ax, cmap=plt.cm.Blues)
plt.show()

In [None]:
# ROC curve of the grid-search model on (X, y).
# NOTE(review): plot_roc_curve is deprecated in scikit-learn 1.0 and removed
# in 1.2; switch to RocCurveDisplay.from_estimator on upgrade.
plot_roc_curve(estimator=gscv, X=X, y=y)