### Bibliotecas

In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from IPython import display

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.tree import export_graphviz

from pandas.plotting import scatter_matrix

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Import do Dataset

In [4]:
col_names = ['Class', 'Specimen_Number', 'Eccentricity',
             'Aspect_Radio', 'Elongation', 'Solidity',
             'Stochastic_Convexity', 'Isoperimetric_Factor', 'Maximal_Identation_Depth',
             'Lobedness', 'Average_Intensity', 'Average_Contrast', 'Smoothness',
             'Third_Moment', 'Uniformity', 'Entropy']

df = pd.read_csv('dados/leaf.csv', header=None, names=col_names)
#df = pd.read_csv('dados/leaf.csv', header=None)

In [5]:
df.head()

Unnamed: 0,Class,Specimen_Number,Eccentricity,Aspect_Radio,Elongation,Solidity,Stochastic_Convexity,Isoperimetric_Factor,Maximal_Identation_Depth,Lobedness,Average_Intensity,Average_Contrast,Smoothness,Third_Moment,Uniformity,Entropy
0,1,1,0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.004657,0.003947,0.04779,0.12795,0.016108,0.005232,0.000275,1.1756
1,1,2,0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.005242,0.005002,0.02416,0.090476,0.008119,0.002708,7.5e-05,0.69659
2,1,3,0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.007457,0.010121,0.011897,0.057445,0.003289,0.000921,3.8e-05,0.44348
3,1,4,0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.006877,0.008607,0.01595,0.065491,0.004271,0.001154,6.6e-05,0.58785
4,1,5,0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.007938,0.045339,0.002051,0.00056,2.4e-05,0.34214


In [6]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(random_state=42)

# Dictionary with parameters names (string) as keys and lists of parameter settings to try as values,
# or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored.
# This enables searching over any sequence of parameter settings.

max_depth = [18, 20, 26, 32]
max_depth.append(None)

param_grid = {
    # The number of trees in the forest
    'n_estimators': [150, 200, 250, 300, 350],
    
    # The function to measure the quality of a split
    'criterion' :['gini', 'entropy'],
    
    # The maximum depth of the tree
    'max_depth' : max_depth,
    
    # The number of features to consider when looking for the best split
    'max_features': ['auto', 'sqrt', 'log2']
}

scoring = ['accuracy', 'f1_micro', 'f1_macro']

CV_rfc = GridSearchCV(estimator=rfc,
                      scoring=scoring,
                      n_jobs=-1, # Number of jobs to run in parallel (-1 == All)
                      param_grid=param_grid,
                      verbose=True,
                      refit='accuracy',
                      cv=10, # 10-Fold Cross-Validation
                      return_train_score=False)
CV_rfc.fit(X_train, y_train)

df2 = pd.DataFrame(CV_rfc.cv_results_)
df2.to_csv('resultado.csv')

Fitting 10 folds for each of 150 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  5.6min finished


In [8]:
# Get the best parameters
print ('Best parameters:', CV_rfc.best_params_)
print ('Best Score:', CV_rfc.best_score_)

Best parameters: {'criterion': 'gini', 'max_depth': 18, 'max_features': 'auto', 'n_estimators': 200}
Best Score: 0.7647058823529411


In [9]:
rfc1=RandomForestClassifier(random_state=42,
                            max_features='auto',
                            n_estimators= 200,
                            max_depth=18,
                            criterion='gini')

In [10]:
rfc1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=18, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [11]:
pred=rfc1.predict(X_test)

In [12]:
print("Accuracy for Random Forest on CV data: ", accuracy_score(y_test,pred))

Accuracy for Random Forest on CV data:  0.75


In [13]:
df2.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_criterion', 'param_max_depth', 'param_max_features',
       'param_n_estimators', 'params', 'split0_test_accuracy',
       'split1_test_accuracy', 'split2_test_accuracy', 'split3_test_accuracy',
       'split4_test_accuracy', 'split5_test_accuracy', 'split6_test_accuracy',
       'split7_test_accuracy', 'split8_test_accuracy', 'split9_test_accuracy',
       'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy',
       'split0_test_f1_micro', 'split1_test_f1_micro', 'split2_test_f1_micro',
       'split3_test_f1_micro', 'split4_test_f1_micro', 'split5_test_f1_micro',
       'split6_test_f1_micro', 'split7_test_f1_micro', 'split8_test_f1_micro',
       'split9_test_f1_micro', 'mean_test_f1_micro', 'std_test_f1_micro',
       'rank_test_f1_micro', 'split0_test_f1_macro', 'split1_test_f1_macro',
       'split2_test_f1_macro', 'split3_test_f1_macro', 'split4_test_f1_macro',
       'split5_te

In [14]:
df2.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_accuracy,...,split3_test_f1_macro,split4_test_f1_macro,split5_test_f1_macro,split6_test_f1_macro,split7_test_f1_macro,split8_test_f1_macro,split9_test_f1_macro,mean_test_f1_macro,std_test_f1_macro,rank_test_f1_macro
0,0.377268,0.019425,0.070951,0.007898,gini,18,auto,150,"{'criterion': 'gini', 'max_depth': 18, 'max_fe...",0.717949,...,0.711111,0.65,0.638889,0.716049,0.84058,0.733333,0.487179,0.688505,0.081638,148
1,0.479241,0.04872,0.11118,0.030212,gini,18,auto,200,"{'criterion': 'gini', 'max_depth': 18, 'max_fe...",0.717949,...,0.755556,0.683333,0.638889,0.709877,0.826087,0.8,0.487179,0.702843,0.081249,16
2,0.595924,0.02383,0.113382,0.00965,gini,18,auto,250,"{'criterion': 'gini', 'max_depth': 18, 'max_fe...",0.717949,...,0.755556,0.683333,0.644444,0.709877,0.84058,0.8,0.487179,0.704682,0.082722,1
3,0.702399,0.025024,0.146004,0.014198,gini,18,auto,300,"{'criterion': 'gini', 'max_depth': 18, 'max_fe...",0.717949,...,0.711111,0.683333,0.644444,0.709877,0.84058,0.733333,0.487179,0.696103,0.078107,46
4,0.868317,0.031569,0.161315,0.015482,gini,18,auto,350,"{'criterion': 'gini', 'max_depth': 18, 'max_fe...",0.74359,...,0.711111,0.683333,0.644444,0.716049,0.84058,0.733333,0.47619,0.699826,0.074015,31


In [15]:
str_tree = export_graphviz(rfc1.estimators_[0], 
   out_file='tree.dot', 
   feature_names=X_train.columns, # column names
   filled=True,        
   special_characters=True)

os.system('dot -Tpng tree.dot -o tree.png')
# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

FileNotFoundError: [Errno 2] No such file or directory: 'tree.png'