# Training and comparison

Objective: Train multiple models on the prepared datasets and compare their results

In [1]:
#Importing most used modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import phik
sns.set_palette('viridis')
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error, r2_score, ConfusionMatrixDisplay
from yellowbrick.classifier import ConfusionMatrix
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier

Loading the datasets

In [2]:
# Load every prepared dataset variant into a dict keyed by the variant name
test_sets = [
    'numeric',
    'categoricals_binned',
    'one_hot_encoded',
    'outliers_removed',
    'one_hot_encoded_rescaled',
    'oversampled+',
    'oversampled-',
    'smoted+',
    'smoted-',
]
# Each variant is read from datasets/<name>.csv, relative to the working directory
datasets = {key: pd.read_csv(f'datasets/{key}.csv') for key in test_sets}

In [3]:
# X/y splitter used by every modelling cell
def df_splitter(dataset, target = 'Response'):
    """Separate a DataFrame into its feature columns and its target values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that contains the target column alongside the features.
    target : str, default 'Response'
        Name of the column to predict.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``dataset`` without the target column and ``y``
        is the target column's ``.values``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``dataset``.
    """
    features = dataset.drop(labels=[target], axis='columns')
    labels = dataset[target].values
    return features, labels

In [4]:
# Printing helper for cross-validation results
def display_scores(scores):
    """Print the fold scores, then their mean and standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Per-fold scores; must provide ``.mean()`` and ``.std()``.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    # Each value is computed lazily, right before its own line is printed
    summary_lines = (
        ("Scores:", lambda: scores),
        ("\nMean:", lambda: scores.mean()),
        ("Standard deviation:", lambda: scores.std()),
    )
    for label, value_of in summary_lines:
        print(label, value_of())

## Models

## Dataset tests
Since we have some datasets options, we will try a first run of t

In [5]:
# Accumulator for the first-round runs: each model cell below appends one
# {'model', 'dataset', 'score'} dict per dataset variant.
first_results = list()

#### K-Neighbors Classifier

In [6]:
# First-round KNN (k=5): 3-fold CV accuracy on the training split of every dataset variant
knn_clf = KNeighborsClassifier(n_jobs=-1, n_neighbors=5)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    knn_test = cross_val_score(
        knn_clf, X_train, y_train,
        scoring='accuracy', cv=3, n_jobs=-1,
    )
    first_results.append(dict(model='KNN', dataset=key, score=knn_test.mean()))
    print(f'\n-----\nDataset: {key}')
    display_scores(knn_test)


-----
Dataset: numeric
Scores: [0.71228536 0.71211675 0.71491745]

Mean: 0.7131065181467098
Standard deviation: 0.0012823729935557734

-----
Dataset: categoricals_binned
Scores: [0.71059032 0.70769458 0.71005307]

Mean: 0.7094459858904245
Standard deviation: 0.0012577064482832434

-----
Dataset: one_hot_encoded
Scores: [0.71022183 0.70850531 0.70953715]

Mean: 0.7094214273332063
Standard deviation: 0.0007055284432189075

-----
Dataset: outliers_removed
Scores: [0.71475655 0.70724399 0.71301221]

Mean: 0.711670917757777
Standard deviation: 0.00321029198290776

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.71464367 0.71395932 0.71587559]

Mean: 0.7148261929129059
Standard deviation: 0.0007928897487709854

-----
Dataset: oversampled+
Scores: [0.62038204 0.6198303  0.6208    ]

Mean: 0.620337448371037
Standard deviation: 0.0003971309629555497

-----
Dataset: oversampled-
Scores: [0.61266322 0.611905   0.6137282 ]

Mean: 0.6127654698725807
Standard deviation: 0.0007478214047553315

--

#### Naive Bayes (Gaussian)

In [7]:
# First-round Gaussian Naive Bayes: 5-fold CV accuracy on the training split of every dataset variant
nbg_clf = GaussianNB()
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    nbg_test = cross_val_score(
        nbg_clf, X_train, y_train,
        scoring='accuracy', cv=5, n_jobs=-1,
    )
    first_results.append(dict(model='Naive Bayes', dataset=key, score=nbg_test.mean()))
    print(f'\n-----\nDataset: {key}')
    display_scores(nbg_test)


-----
Dataset: numeric
Scores: [0.75592679 0.74695983 0.74462597 0.75113622 0.74695983]

Mean: 0.7491217295172583
Standard deviation: 0.003998162378626328

-----
Dataset: categoricals_binned
Scores: [0.75690947 0.74953937 0.74376612 0.75224174 0.746837  ]

Mean: 0.749858739712566
Standard deviation: 0.004510973372125921

-----
Dataset: one_hot_encoded
Scores: [0.75506695 0.74941653 0.7443803  0.7492937  0.74867952]

Mean: 0.7493673995823609
Standard deviation: 0.003401088891060924

-----
Dataset: outliers_removed
Scores: [0.75418227 0.75156055 0.74956305 0.75402672 0.7445374 ]

Mean: 0.7507739966071254
Standard deviation: 0.003554625483263591

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.72583221 0.72976293 0.72816607 0.71919912 0.72288417]

Mean: 0.725168898169758
Standard deviation: 0.003778530191324014

-----
Dataset: oversampled+
Scores: [0.52828054 0.53139394 0.53373737 0.52711111 0.52864646]

Mean: 0.5298338863750629
Standard deviation: 0.0024044679324950508

-----
Dataset

#### Support Vector Machine

In [9]:
# First-round SVC (C=1.0, default kernel): 2-fold CV accuracy on the training split of every dataset variant
svm = SVC(C=1.0, cache_size=4096, random_state=1216)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    svc_test = cross_val_score(
        svm, X_train, y_train,
        scoring='accuracy', cv=2, n_jobs=-1,
    )
    first_results.append(
        dict(model='Support Vector Machine', dataset=key, score=svc_test.mean())
    )
    print(f'\n-----\nDataset: {key}')
    display_scores(svc_test)


-----
Dataset: numeric
Scores: [0.76037931 0.76036753]

Mean: 0.7603734183543323
Standard deviation: 5.88690779129708e-06

-----
Dataset: categoricals_binned
Scores: [0.76037931 0.76036753]

Mean: 0.7603734183543323
Standard deviation: 5.88690779129708e-06

-----
Dataset: one_hot_encoded
Scores: [0.76037931 0.76036753]

Mean: 0.7603734183543323
Standard deviation: 5.88690779129708e-06

-----
Dataset: outliers_removed
Scores: [0.76038753 0.76038753]

Mean: 0.7603875349580503
Standard deviation: 0.0

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.76037931 0.76036753]

Mean: 0.7603734183543323
Standard deviation: 5.88690779129708e-06

-----
Dataset: oversampled+
Scores: [0.5166462  0.53096516]

Mean: 0.5238056758678648
Standard deviation: 0.007159480250824246

-----
Dataset: oversampled-
Scores: [0.50775245 0.50517394]

Mean: 0.5064631951570698
Standard deviation: 0.0012892521197188644

-----
Dataset: smoted+
Scores: [0.67318508 0.66920939]

Mean: 0.6711972331760294
Standard deviatio

#### Decision Trees

In [12]:
# First-round decision tree (default hyper-parameters): CV accuracy on the training split of every dataset variant
dtc = DecisionTreeClassifier(random_state=1216)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    # 3 folds are used here (scikit-learn's own default would be 5)
    dtc_test = cross_val_score(
        dtc, X_train, y_train,
        scoring='accuracy', cv=3, n_jobs=-1,
    )
    first_results.append(dict(model='Decision Tree', dataset=key, score=dtc_test.mean()))
    print(f'\n-----\nDataset: {key}')
    display_scores(dtc_test)


-----
Dataset: numeric
Scores: [0.66195003 0.66384139 0.66376769]

Mean: 0.6631863711175029
Standard deviation: 0.0008747406010030579

-----
Dataset: categoricals_binned
Scores: [0.63026015 0.63509729 0.63760318]

Mean: 0.6343202078382513
Standard deviation: 0.003047722663274115

-----
Dataset: one_hot_encoded
Scores: [0.64713686 0.64725825 0.64954304]

Mean: 0.6479793844129307
Standard deviation: 0.0011067834046584052

-----
Dataset: outliers_removed
Scores: [0.64539326 0.64281969 0.6404225 ]

Mean: 0.6428784829510718
Standard deviation: 0.002029728015538005

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.64706316 0.64718455 0.64946934]

Mean: 0.647905683393311
Standard deviation: 0.0011067820263821067

-----
Dataset: oversampled+
Scores: [0.77557452 0.77260606 0.7785697 ]

Mean: 0.7755834250583015
Standard deviation: 0.0024346524976599873

-----
Dataset: oversampled-
Scores: [0.7694506  0.77303637 0.77313492]

Mean: 0.7718739619164352
Standard deviation: 0.00171404536073089

---

#### Extra Trees

In [15]:
# First-round Extra Trees (default hyper-parameters): CV accuracy on the training split of every dataset variant
extrees = ExtraTreesClassifier(random_state=1216, n_jobs=-1)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    # 3 folds are used here (scikit-learn's own default would be 5)
    extrees_test = cross_val_score(
        extrees, X_train, y_train,
        scoring='accuracy', cv=3, n_jobs=-1,
    )
    first_results.append(dict(model='Extra Trees', dataset=key, score=extrees_test.mean()))
    print(f'\n-----\nDataset: {key}')
    display_scores(extrees_test)


-----
Dataset: numeric
Scores: [0.74839708 0.74985259 0.75081073]

Mean: 0.7496868023515727
Standard deviation: 0.0009923175815198183

-----
Dataset: categoricals_binned
Scores: [0.73793205 0.73776533 0.73540684]

Mean: 0.7370347402699734
Standard deviation: 0.0011531100974619445

-----
Dataset: one_hot_encoded
Scores: [0.71648611 0.72435142 0.72199292]

Mean: 0.7209434825543427
Standard deviation: 0.0032956294397047925

-----
Dataset: outliers_removed
Scores: [0.71857678 0.72222638 0.71848079]

Mean: 0.7197613160341841
Standard deviation: 0.0017435067655318006

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.71648611 0.72435142 0.72199292]

Mean: 0.7209434825543427
Standard deviation: 0.0032956294397047925

-----
Dataset: oversampled+
Scores: [0.83142636 0.83515152 0.83369697]

Mean: 0.8334249466447662
Standard deviation: 0.0015329061255170064

-----
Dataset: oversampled-
Scores: [0.83439271 0.82925988 0.83753819]

Mean: 0.8337302586526795
Standard deviation: 0.003411913276928968


#### Random Forest

In [18]:
# First-round random forest (70 bootstrapped trees, OOB score enabled):
# 3-fold CV accuracy on the training split of every dataset variant
rnd_clf = RandomForestClassifier(
    n_estimators=70,
    max_leaf_nodes=None,
    bootstrap=True,
    oob_score=True,
    random_state=1216,
    n_jobs=-1,
)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    rnd_clf_test = cross_val_score(
        rnd_clf, X_train, y_train,
        scoring='accuracy', cv=3, n_jobs=-1,
    )
    first_results.append(dict(model='Random Forest', dataset=key, score=rnd_clf_test.mean()))
    print(f'\n-----\nDataset: {key}')
    display_scores(rnd_clf_test)


-----
Dataset: numeric
Scores: [0.75488245 0.75670696 0.7550855 ]

Mean: 0.7555583018265367
Standard deviation: 0.0008164410781646076

-----
Dataset: categoricals_binned
Scores: [0.7478812  0.7499263  0.74985259]

Mean: 0.7492200304343605
Standard deviation: 0.0009471742677313469

-----
Dataset: one_hot_encoded
Scores: [0.74471221 0.75051592 0.75014741]

Mean: 0.7484585123768754
Standard deviation: 0.002653303283560516

-----
Dataset: outliers_removed
Scores: [0.74883895 0.75016855 0.75031838]

Mean: 0.7497752930568754
Standard deviation: 0.0006649128782569847

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.74478591 0.75051592 0.7500737 ]

Mean: 0.7484585105663065
Standard deviation: 0.002603189114267456

-----
Dataset: oversampled+
Scores: [0.82846892 0.83612121 0.8326303 ]

Mean: 0.8324068126234713
Standard deviation: 0.003128028585296697

-----
Dataset: oversampled-
Scores: [0.83178123 0.82743668 0.82970336]

Mean: 0.8296404227649076
Standard deviation: 0.0017742117696808272

-

#### AdaBoost

In [22]:
# First-round AdaBoost (500 estimators, learning rate 0.8):
# 3-fold CV accuracy on the training split of every dataset variant
ada_boost = AdaBoostClassifier(learning_rate=0.8, n_estimators=500, random_state=1216)
for key in test_sets:
    X, y = df_splitter(datasets[key])
    # 20% of the rows are held out; only the training part is cross-validated here
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1216, test_size=0.2
    )
    ada_boost_test = cross_val_score(
        ada_boost, X_train, y_train,
        scoring='accuracy', cv=3, n_jobs=-1,
    )
    first_results.append(dict(model='AdaBoost', dataset=key, score=ada_boost_test.mean()))
    print(f'\n-----\nDataset: {key}')
    display_scores(ada_boost_test)


-----
Dataset: numeric
Scores: [0.75945169 0.75980248 0.75906545]

Mean: 0.7594398719611991
Standard deviation: 0.0003010065924008337

-----
Dataset: categoricals_binned
Scores: [0.75996757 0.75980248 0.75994988]

Mean: 0.7599066438784113
Standard deviation: 7.401075954548328e-05

-----
Dataset: one_hot_encoded
Scores: [0.75989388 0.75980248 0.75987618]

Mean: 0.7598575104688545
Standard deviation: 3.957989254669246e-05

-----
Dataset: outliers_removed
Scores: [0.76022472 0.76028167 0.76020676]

Mean: 0.760237715066819
Standard deviation: 3.193349830681007e-05

-----
Dataset: one_hot_encoded_rescaled
Scores: [0.75989388 0.75980248 0.75994988]

Mean: 0.7598820780789174
Standard deviation: 6.0753578180248605e-05

-----
Dataset: oversampled+
Scores: [0.56530592 0.56213333 0.55873939]

Mean: 0.5620595506113203
Standard deviation: 0.0026812825313568974

-----
Dataset: oversampled-
Scores: [0.56132052 0.55627279 0.55533655]

Mean: 0.5576432883196627
Standard deviation: 0.0026281390582369975

#### Visualizing the results

In [103]:
# Build the model x dataset table of mean CV accuracy from the first-round runs
results = pd.DataFrame(first_results)
# Re-running any model cell appends its rows to first_results a second time, and
# DataFrame.pivot raises ValueError on duplicate (model, dataset) pairs.
# Keep only the most recent score for each pair so this cell always works.
results = results.drop_duplicates(subset=['model', 'dataset'], keep='last')
results = results.pivot(index='model', columns = 'dataset', values = 'score')
# Fix the column and row order for display (pivot would otherwise sort them alphabetically)
results = results.reindex(columns= [ 'numeric','categoricals_binned','one_hot_encoded','one_hot_encoded_rescaled','outliers_removed','oversampled+','oversampled-','smoted+','smoted-'])
results = results.reindex(['KNN', 'Naive Bayes', 'Support Vector Machine', 'Decision Tree', 'Extra Trees', 'Random Forest', 'AdaBoost'])
# axis=0 colours each dataset column independently.
# NOTE(review): low/high widen the colour range by a fraction of the data range;
# they are not score thresholds. If 0.7 / 0.8 were meant as accuracy cut-offs,
# vmin/vmax would be the matching arguments - confirm the intent.
results.style.background_gradient(cmap='bwr_r', low=0.7, high=0.8, axis=0)

dataset,numeric,categoricals_binned,one_hot_encoded,one_hot_encoded_rescaled,outliers_removed,oversampled+,oversampled-,smoted+,smoted-
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
KNN,0.713107,0.709446,0.709421,0.714826,0.711671,0.620337,0.612765,0.616281,0.651036
Naive Bayes,0.749122,0.749859,0.749367,0.725169,0.750774,0.529834,0.534287,0.735228,0.634644
Support Vector Machine,0.760373,0.760373,0.760373,0.760373,0.760388,0.523806,0.506463,0.671197,0.505067
Decision Tree,0.663186,0.63432,0.647979,0.647906,0.642878,0.775583,0.771874,0.691771,0.738548
Extra Trees,0.749687,0.737035,0.720943,0.720943,0.719761,0.833425,0.83373,0.758517,0.786722
Random Forest,0.755558,0.74922,0.748459,0.748459,0.749775,0.832407,0.82964,0.770735,0.805545
AdaBoost,0.75944,0.759907,0.759858,0.759882,0.760238,0.56206,0.557643,0.755899,0.817929


### Improving the models

Re-analyzing the best models

#### Extra Trees
'oversampled-' dataset

In [294]:
# Prepare the data and the base Extra Trees classifier for tuning on 'oversampled-'
# NOTE(review): if this CSV was oversampled before being split here, copies of the
# same row can land in both the train and test parts and inflate the scores -
# confirm how the dataset was produced.
X, y = df_splitter(datasets['oversampled-'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1216, test_size=0.2
)
extra_tree = ExtraTreesClassifier(n_jobs=-1, random_state=1216)

In [295]:
# Hyper-parameter grid for the Extra Trees search:
# 9 x 9 x 5 x 4 x 2 = 3240 combinations
alg = ['gini','entropy']
max_leaf_nodes = np.array([2,3,4,5,None])
max_depth = np.array([3,7,11,15,20,30,50,100,None])
min_samples_split = np.array([2,4,6,8,10,12,20,30,40])
n_estimators= np.array([100,300,500,1000])
values_grid = dict(
    min_samples_split=min_samples_split,
    max_depth=max_depth,
    max_leaf_nodes=max_leaf_nodes,
    n_estimators=n_estimators,
    criterion=alg,
)

In [296]:
# Exhaustive 3-fold grid search over values_grid for the Extra Trees classifier
# (the variable keeps its historical "DecisionTree" name).
# NOTE(review): n_jobs is not set on the search itself, so pre_dispatch=-1
# presumably has no effect; parallelism comes from the estimator's own n_jobs - confirm.
gridDecisionTree = GridSearchCV(
    estimator=extra_tree,
    param_grid=values_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=True,
    pre_dispatch=-1,
    verbose=3,
)

In [297]:
# Run the Extra Trees grid search on the training split; the trailing ';' hides the estimator repr
gridDecisionTree.fit(X_train, y_train);

Fitting 3 folds for each of 3240 candidates, totalling 9720 fits
[CV 1/3] END criterion=gini, max_depth=3, max_leaf_nodes=2, min_samples_split=2, n_estimators=100;, score=(train=0.548, test=0.554) total time=   0.5s
[CV 2/3] END criterion=gini, max_depth=3, max_leaf_nodes=2, min_samples_split=2, n_estimators=100;, score=(train=0.552, test=0.548) total time=   0.4s
[CV 3/3] END criterion=gini, max_depth=3, max_leaf_nodes=2, min_samples_split=2, n_estimators=100;, score=(train=0.551, test=0.546) total time=   0.5s
[CV 1/3] END criterion=gini, max_depth=3, max_leaf_nodes=2, min_samples_split=2, n_estimators=300;, score=(train=0.539, test=0.539) total time=   0.9s
[CV 2/3] END criterion=gini, max_depth=3, max_leaf_nodes=2, min_samples_split=2, n_estimators=300;, score=(train=0.538, test=0.533) total time=   1.0s
[CV 3/3] END criterion=gini, max_depth=3, max_leaf_nodes=2, min_samples_split=2, n_estimators=300;, score=(train=0.538, test=0.538) total time=   0.9s
[CV 1/3] END criterion=gini, 

In [298]:
# One row per candidate: fit/score timings, per-fold and mean train/test scores, and rank
pd.DataFrame(gridDecisionTree.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_leaf_nodes,param_min_samples_split,param_n_estimators,params,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.486187,0.050490,0.061965,0.002828,gini,3,2,2,100,"{'criterion': 'gini', 'max_depth': 3, 'max_lea...",...,0.548487,0.546467,0.549628,0.003152,2539,0.548438,0.551750,0.551159,0.550449,0.001443
1,0.908836,0.003687,0.139053,0.002930,gini,3,2,2,300,"{'criterion': 'gini', 'max_depth': 3, 'max_lea...",...,0.533163,0.538238,0.536784,0.002577,3079,0.539002,0.537806,0.538028,0.538278,0.000519
2,1.400890,0.120565,0.217112,0.002540,gini,3,2,2,500,"{'criterion': 'gini', 'max_depth': 3, 'max_lea...",...,0.534148,0.543658,0.539346,0.003933,2917,0.539889,0.538914,0.545714,0.541506,0.003002
3,2.419742,0.008181,0.410752,0.004100,gini,3,2,2,1000,"{'criterion': 'gini', 'max_depth': 3, 'max_lea...",...,0.535183,0.545679,0.542680,0.005337,2755,0.545851,0.536820,0.550149,0.544273,0.005555
4,0.415242,0.007581,0.058583,0.003641,gini,3,2,4,100,"{'criterion': 'gini', 'max_depth': 3, 'max_lea...",...,0.548487,0.546467,0.549628,0.003152,2539,0.548438,0.551750,0.551159,0.550449,0.001443
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3235,17.940001,1.084930,1.759000,0.019252,entropy,,,30,1000,"{'criterion': 'entropy', 'max_depth': None, 'm...",...,0.710456,0.715088,0.715421,0.004197,278,0.872401,0.873020,0.874744,0.873388,0.000992
3236,1.828841,0.059312,0.163665,0.006944,entropy,,,40,100,"{'criterion': 'entropy', 'max_depth': None, 'm...",...,0.692766,0.695624,0.697025,0.004169,336,0.843328,0.840646,0.843455,0.842477,0.001295
3237,5.216334,0.529912,0.487334,0.039348,entropy,,,40,300,"{'criterion': 'entropy', 'max_depth': None, 'm...",...,0.692963,0.696905,0.697321,0.003739,331,0.843402,0.840228,0.844515,0.842715,0.001816
3238,9.189730,0.343781,0.804331,0.052639,entropy,,,40,500,"{'criterion': 'entropy', 'max_depth': None, 'm...",...,0.693998,0.697349,0.698044,0.003620,326,0.844092,0.842026,0.845820,0.843979,0.001551


In [299]:
# Estimator built with the best-scoring parameter combination of the search
best = gridDecisionTree.best_estimator_

In [300]:
# Fit the selected model on the full training split.
# NOTE(review): the search is created without a refit argument, and scikit-learn's
# default (refit=True) already fits best_estimator_ on all of X_train, so this
# call looks redundant - confirm before removing.
best.fit(X_train, y_train);

In [301]:
# Predicted labels for the held-out test split ("teste" = test)
teste = best.predict(X_test)

In [302]:
# Per-class precision / recall / F1 and overall accuracy on the held-out test split
print(classification_report(y_test, teste))

              precision    recall  f1-score   support

           0       0.93      0.88      0.90      7581
           1       0.88      0.93      0.91      7640

    accuracy                           0.90     15221
   macro avg       0.91      0.90      0.90     15221
weighted avg       0.91      0.90      0.90     15221



#### Random Forest

In [303]:
# Base random forest for tuning on the 'oversampled+' dataset.
# random_state is fixed (same seed as every other estimator in this notebook) so
# the grid search and the final model are reproducible across kernel restarts.
rnd_clf = RandomForestClassifier(bootstrap=True,
                                 oob_score=True,
                                 n_jobs = -1,
                                 random_state = 1216)
# NOTE(review): if this CSV was oversampled before being split here, copies of the
# same row can land in both the train and test parts and inflate the scores -
# confirm how the dataset was produced.
X, y = df_splitter(datasets['oversampled+'])
# 20% hold-out; this section uses split seed 42 rather than the 1216 used in the first round
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [304]:
# Grid search parameters for the random forest: 6 x 7 x 7 x 2 = 588 combinations.
# The fifth n_estimators value was a second 100 (an apparent typo for 1000), which
# made the search evaluate the same 100-tree candidates twice.
n_estimators = np.array([70,100,200,500,1000,2000])
max_depth = np.array([3,7,9,15,30,50,None])
max_leaf_nodes = np.array([2,5,7,8,9,15,None])
alg = np.array(['entropy', 'gini'])
values_grid = {'n_estimators': n_estimators, 'max_depth': max_depth, 'max_leaf_nodes':max_leaf_nodes, 'criterion':alg}

In [305]:
# Exhaustive 2-fold grid search over values_grid for the random forest
# (the variable keeps its historical "DecisionTree" name).
# NOTE(review): n_jobs is not set on the search itself, so pre_dispatch=-1
# presumably has no effect; parallelism comes from the estimator's own n_jobs - confirm.
gridDecisionTree = GridSearchCV(
    estimator=rnd_clf,
    param_grid=values_grid,
    scoring='accuracy',
    cv=2,
    return_train_score=True,
    pre_dispatch=-1,
    verbose=3,
)

In [None]:
# Run the random forest grid search on the training split; the trailing ';' hides the estimator repr
gridDecisionTree.fit(X_train, y_train);

Fitting 2 folds for each of 588 candidates, totalling 1176 fits
[CV 1/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=70;, score=(train=0.537, test=0.529) total time=   0.7s
[CV 2/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=70;, score=(train=0.552, test=0.549) total time=   0.6s
[CV 1/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=100;, score=(train=0.551, test=0.551) total time=   0.9s
[CV 2/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=100;, score=(train=0.552, test=0.551) total time=   0.9s
[CV 1/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=200;, score=(train=0.541, test=0.534) total time=   1.7s
[CV 2/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=200;, score=(train=0.551, test=0.548) total time=   1.6s
[CV 1/2] END criterion=entropy, max_depth=3, max_leaf_nodes=2, n_estimators=500;, score=(train=0.548, test=0.542) total time=   4.0s
[CV 2/2

In [None]:
# Tabulate the per-candidate cross-validation results of the random forest search
pd.DataFrame(gridDecisionTree.cv_results_)

In [None]:
# Estimator built with the best-scoring parameter combination of the search
best = gridDecisionTree.best_estimator_

In [None]:
# Fit the selected model on the full training split.
# NOTE(review): the search is created without a refit argument, and scikit-learn's
# default (refit=True) already fits best_estimator_ on all of X_train, so this
# call looks redundant - confirm before removing.
best.fit(X_train, y_train);

In [None]:
# Predicted labels for the held-out test split ("teste" = test)
teste = best.predict(X_test)

In [None]:
# Per-class precision / recall / F1 and overall accuracy on the held-out test split
print(classification_report(y_test, teste))

#### Adaboost

AdaBoost on the 'smoted-' dataset



In [None]:
# Prepare the data and the base AdaBoost classifier for tuning on 'smoted-'
# NOTE(review): if this CSV was SMOTE-resampled before being split here, synthetic
# points derived from test rows may sit in the training part - confirm how the
# dataset was produced.
X, y = df_splitter(datasets['smoted-'])
# 20% hold-out; this section uses split seed 42 rather than the 1216 used in the first round
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
ada_boost = AdaBoostClassifier(random_state=1216)

In [None]:
# Grid search parameters for AdaBoost: 7 x 7 = 49 combinations
learning_rate = np.array([0.5,1,1.5,2,3,5,10])
n_estimators = np.array([70,100,150,200,300,500,600])
values_grid = dict(n_estimators=n_estimators, learning_rate=learning_rate)

In [None]:
# Exhaustive 2-fold grid search over values_grid for AdaBoost
# (the variable keeps its historical "DecisionTree" name).
# AdaBoostClassifier has no n_jobs of its own, so the parallelism has to come
# from the search: n_jobs=-1 uses all cores. The former pre_dispatch=-1 only
# sets how many jobs are queued ahead and did not enable parallel execution, so
# it is dropped in favour of the default.
gridDecisionTree = GridSearchCV(estimator = ada_boost,
                                param_grid = values_grid,
                                cv = 2, verbose = 3,
                                scoring='accuracy',
                                n_jobs=-1,
                                return_train_score= True)

In [None]:
# Run the AdaBoost grid search on the training split; the trailing ';' hides the estimator repr
gridDecisionTree.fit(X_train, y_train);

In [None]:
# Tabulate the per-candidate cross-validation results of the AdaBoost search
pd.DataFrame(gridDecisionTree.cv_results_)

In [None]:
# Estimator built with the best-scoring parameter combination of the search
best = gridDecisionTree.best_estimator_

In [None]:
# Fit the selected model on the full training split.
# NOTE(review): the search is created without a refit argument, and scikit-learn's
# default (refit=True) already fits best_estimator_ on all of X_train, so this
# call looks redundant - confirm before removing.
best.fit(X_train, y_train);

In [None]:
# Predicted labels for the held-out test split ("teste" = test)
teste = best.predict(X_test)

In [None]:
# Per-class precision / recall / F1 and overall accuracy on the held-out test split
print(classification_report(y_test, teste))