In [None]:
%matplotlib inline
import matplotlib as mpl
# mpl.use('Agg')
from matplotlib import rcParams
# rcParams.update({'figure.autolayout': True})
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Global plot styling for the whole notebook.
# NOTE(review): two named styles are applied back to back; the "ticks" call
# presumably overrides whatever the "white" call set — confirm and keep only one.
sns.set_style("white", {'legend.frameon': True})
sns.set_style("ticks", {'legend.frameon': True})
# Use seaborn's "talk" plotting context preset.
sns.set_context("talk")
# Default colour cycle: 8 colours from the 'Dark2' palette, passed with desat=1.
sns.set_palette('Dark2', 8,desat=1)
# Keep a handle on the current palette (not referenced again in the visible cells).
cc = sns.color_palette()

from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestRegressor,HistGradientBoostingRegressor,RandomForestClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

from sklearn.tree import export_graphviz
from io import StringIO
from IPython.display import Image  
import pydotplus
from sklearn import tree
from astropy.table import Table
# import graphviz
from joblib import parallel_backend

In [None]:
# Load the scikit-learn breast-cancer dataset as pandas objects.
# `data.frame` is the DataFrame used below; it contains the feature columns
# plus a 'target' column (the later cells split them apart).
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer(as_frame=True)
df = data.frame
# Bare last expression: rendered as the cell output.
df

In [None]:
# Summary statistics of the DataFrame columns.
df.describe()
# Target 1 -> benign.

In [None]:
# Helper that reports a few metrics of a fitted decision tree.
def GetMetrics(y_test, y_pred,clf,Output):
    """Print test metrics, save a confusion-matrix figure and describe the tree.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation set.
    y_pred : array-like
        Labels predicted by ``clf`` for the same samples.
    clf : fitted decision-tree estimator
        Must expose ``get_depth()``, ``get_n_leaves()``, ``tree_``,
        ``classes_``, ``max_features_`` and ``n_features_in_``.
    Output : str
        Path where the confusion-matrix figure is saved.

    Returns
    -------
    tuple
        ``(Accuracy, Precision, Depth, NLeaves, Nodes, TerminalNodes,
        MaxFeatures, NFeaturesIn, FeaturesNames)``.

        * ``Accuracy`` and ``Precision`` are rounded to 3 decimals. Recall is
          printed but is not part of the returned tuple.
        * ``TerminalNodes`` is ``node_count - n_leaves``, i.e. the number of
          non-leaf nodes, despite its name; the leaf count is ``NLeaves``.
        * ``FeaturesNames`` is ``None`` when the estimator does not expose
          ``feature_names_in_``.
    """
    Accuracy = round(metrics.accuracy_score(y_test, y_pred),3)
    Precision = round(metrics.precision_score(y_test, y_pred),3)
    Recall = round(metrics.recall_score(y_test, y_pred),3)
    print("Accuracy:",Accuracy)
    print("Precision:",Precision)
    print("Recall:",Recall)

    # Structural description of the fitted tree.
    Depth = clf.get_depth()
    NLeaves = clf.get_n_leaves()
    Nodes = clf.tree_.node_count
    # Total nodes minus leaves (name kept for backward compatibility).
    TerminalNodes = Nodes - NLeaves

    MaxFeatures = clf.max_features_
    NFeaturesIn = clf.n_features_in_
    # Fall back to None instead of raising AttributeError when the estimator
    # carries no feature names.
    FeaturesNames = getattr(clf, 'feature_names_in_', None)

    # Confusion matrix expressed as fractions of all evaluated samples.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=clf.classes_,normalize='all')
    disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=clf.classes_)
    disp.plot()
    disp.ax_.set_xlabel('Clase Predicha')
    disp.ax_.set_ylabel('Clase Real')
    disp.figure_.subplots_adjust(left=0.05, bottom=0.15, right=0.98, top=0.95,wspace=0.00, hspace=0.05)
    disp.figure_.savefig(Output)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(disp.figure_)
    return Accuracy,Precision,Depth,NLeaves,Nodes,TerminalNodes,MaxFeatures,NFeaturesIn,FeaturesNames

In [None]:
# Response vector: the 'target' column.
y = df['target']

# Predictor names: every column except 'target', in their original order.
columns_2_use = list(df.columns)
columns_2_use.remove('target')

# Feature matrix restricted to the predictor columns.
X = df.loc[:, columns_2_use]

In [None]:
# Display the feature matrix (bare expression -> cell output).
X

In [None]:
# Fixed seed: the original drew a new random seed on every run, so the split
# (and every metric, plot and hand-picked hyper-parameter derived from it)
# changed between executions.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=SPLIT_SEED) # 50% training and 50% test
# Sanity check of the resulting sizes (shown as the cell output).
len(X),len(y),len(X_train),len(y_train),len(X_test),len(y_test)

In [None]:
# Baseline decision tree: Gini criterion, depth capped at 2, ccp_alpha=0.
clf = DecisionTreeClassifier(criterion='gini',
                             max_depth=2,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             ccp_alpha=0,
                             # Equally good candidate splits are chosen between at
                             # random; pin the seed so the fitted tree is reproducible.
                             random_state=42)
# Train the classifier on the training split.
clf = clf.fit(X_train,y_train)

# Predict the test split, print the metrics and save the confusion matrix.
y_pred = clf.predict(X_test)
print(GetMetrics(y_test, y_pred,clf,'CM_gini.pdf'))

In [None]:
# Serialise the fitted tree to Graphviz DOT text held in memory.
dot_data = StringIO()

export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,feature_names = columns_2_use,class_names=['0','1'],proportion=True,rotate=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Render the PNG once and reuse the bytes for both the file on disk and the
# inline image (the original invoked Graphviz twice for the same picture).
png_bytes = graph.create_png()
with open('Tree1.png', 'wb') as png_file:
    png_file.write(png_bytes)
Image(png_bytes)

In [None]:
# Sweep max_depth and record accuracy, precision and recall on the test split.
Acc_array = []
Prec_array = []
Rec_array = []
Max_Depth_Array = np.arange(1,31,1)
for max_depth in Max_Depth_Array:

    # Decision tree limited to the current depth.
    clf = DecisionTreeClassifier(criterion='gini',
                                 max_depth=max_depth,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 ccp_alpha=0,
                                 # Same seed for every depth so the curves differ only
                                 # because of max_depth, and re-runs give the same curves.
                                 random_state=42)
    # Train on the training split.
    clf = clf.fit(X_train,y_train)

    # Evaluate on the test split.
    y_pred = clf.predict(X_test)
    Accuracy = metrics.accuracy_score(y_test, y_pred)
    Precision = metrics.precision_score(y_test, y_pred)
    Recall = metrics.recall_score(y_test, y_pred)
    Acc_array.append(Accuracy)
    Prec_array.append(Precision)
    Rec_array.append(Recall)

In [None]:
# Figure with a 0.9 height/width aspect ratio and hand-tuned margins.
w, h = 1.0*plt.figaspect(0.9)
fig1 = plt.figure(figsize=(w, h))
fig1.subplots_adjust(left=0.16, bottom=0.16, right=0.98,
                     top=0.95, wspace=0.00, hspace=0.05)
ax = fig1.gca()

# One curve per metric, drawn explicitly on the axes created above.
ax.plot(Max_Depth_Array, Acc_array, '-', label='Accuracy')
ax.plot(Max_Depth_Array, Prec_array, '-', label='Precision')
ax.plot(Max_Depth_Array, Rec_array, '-', label='Recall')
ax.set_xlabel('max_depth')
ax.set_ylabel('Métricas')
ax.legend(loc=0)
plt.show()

# Depth with the highest test accuracy (first one when several tie).
# NOTE(review): the value is selected on the test split, so metrics reported
# for it on that same split are optimistic.
print('max_depth:',Max_Depth_Array[np.argmax(Acc_array)])

In [None]:
# Sweep ccp_alpha (max_depth fixed at 5) and record accuracy, precision and
# recall on the test split. This overwrites the arrays of the previous sweep.
Acc_array = []
Prec_array = []
Rec_array = []
ccp_alpha_Array = np.arange(0,0.01,0.001)
for ccp_alpha in ccp_alpha_Array:

    # Decision tree pruned with the current ccp_alpha.
    clf = DecisionTreeClassifier(criterion='gini',
                                 max_depth=5,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 ccp_alpha=ccp_alpha,
                                 # Same seed for every alpha so the curves differ only
                                 # because of ccp_alpha, and re-runs give the same curves.
                                 random_state=42)
    # Train on the training split.
    clf = clf.fit(X_train,y_train)

    # Evaluate on the test split.
    y_pred = clf.predict(X_test)
    Accuracy = metrics.accuracy_score(y_test, y_pred)
    Precision = metrics.precision_score(y_test, y_pred)
    Recall = metrics.recall_score(y_test, y_pred)
    Acc_array.append(Accuracy)
    Prec_array.append(Precision)
    Rec_array.append(Recall)

In [None]:
# Figure with a 0.9 height/width aspect ratio and hand-tuned margins.
w, h = 1.0*plt.figaspect(0.9)
fig1 = plt.figure(figsize=(w, h))
fig1.subplots_adjust(left=0.16, bottom=0.16, right=0.98,
                     top=0.95, wspace=0.00, hspace=0.05)
ax = fig1.gca()

# One curve per metric, drawn explicitly on the axes created above.
ax.plot(ccp_alpha_Array, Acc_array, '-', label='Accuracy')
ax.plot(ccp_alpha_Array, Prec_array, '-', label='Precision')
ax.plot(ccp_alpha_Array, Rec_array, '-', label='Recall')
ax.set_xlabel('ccp_alpha')
ax.set_ylabel('Métricas')
ax.legend(loc=0)
plt.show()

# ccp_alpha with the highest test accuracy (first one when several tie).
# NOTE(review): the value is selected on the test split, so metrics reported
# for it on that same split are optimistic.
print('ccp_alpha:',ccp_alpha_Array[np.argmax(Acc_array)])

In [None]:
# Decision tree with max_depth=5 and cost-complexity pruning.
# NOTE(review): ccp_alpha=0.008 is hard-coded — presumably read off the
# ccp_alpha sweep of an earlier run; confirm it still matches the sweep output.
clf = DecisionTreeClassifier(criterion='gini',
                             max_depth=5,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             ccp_alpha=0.008,
                             # Equally good candidate splits are chosen between at
                             # random; pin the seed so the fitted tree is reproducible.
                             random_state=42)
# Train the classifier on the training split.
clf = clf.fit(X_train,y_train)

# Predict the test split, print the metrics and save the confusion matrix.
y_pred = clf.predict(X_test)
print(GetMetrics(y_test, y_pred,clf,'CM_gini_2.pdf'))

In [None]:
# Serialise the fitted tree to Graphviz DOT text held in memory.
dot_data = StringIO()

export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,feature_names = columns_2_use,class_names=['0','1'],proportion=True,rotate=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# Render the PNG once and reuse the bytes for both the file on disk and the
# inline image (the original invoked Graphviz twice for the same picture).
png_bytes = graph.create_png()
with open('Tree2.png', 'wb') as png_file:
    png_file.write(png_bytes)
Image(png_bytes)

In [None]:
# Random forest of 100 trees using the same per-tree settings as the pruned tree.
clf = RandomForestClassifier(n_estimators=100,criterion='gini',
                             max_depth=5,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             ccp_alpha=0.008,
                             # The forest is stochastic; pin the seed so the metrics
                             # and feature importances are reproducible across runs.
                             random_state=42)
# Train the random forest inside a joblib threading backend with 4 workers.
with parallel_backend('threading', n_jobs=4):
    clf = clf.fit(X_train,y_train)

# Predict the test split and report the (unrounded) metrics.
y_pred = clf.predict(X_test)
Accuracy = metrics.accuracy_score(y_test, y_pred)
Precision = metrics.precision_score(y_test, y_pred)
Recall = metrics.recall_score(y_test, y_pred)
print("Accuracy:",Accuracy)
print("Precision:",Precision)
print("Recall:",Recall)

In [None]:
# Feature names and the importances reported by the current `clf`.
columns = np.array(X.columns.tolist())
Importance = clf.feature_importances_

# Raw view: importances rounded to one decimal next to the feature names.
print(np.around(Importance,1),columns)
columns_2_use = np.array(columns)

# Reorder both arrays with one shared descending-importance ordering.
descending = np.argsort(Importance)[::-1]
columns = columns[descending]
Importance = Importance[descending]

# List only the features whose importance does not round to 0 at one decimal.
for feature, weight in zip(columns, Importance):
    if round(weight,1)>0:
        print(feature,round(weight,3))