In [2]:
import warnings
warnings.filterwarnings("ignore")

In [42]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [4]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [5]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

### upload and divide data

In [39]:
#d = pd.read_csv("data/03-pico-sem-town-shortname.csv")
#d = pd.read_csv("data/03-idf.csv")
#d = pd.read_csv("data/03-idf.csv")
d = pd.read_csv("data/03-idf-completo-fixedq.csv")
#d=d.dropna()

In [None]:
# sample data? faster models
#d = d.sample(frac=0.01)

In [8]:
d.isna().sum().sum()

68

In [12]:
X, y = d.drop("target", axis=1), d["target"]

### prepare for cross validation and/or train/test split

In [13]:
# cross-validation splitting strategy
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

In [14]:
# train/test split
# https://towardsdatascience.com/understanding-the-confusion-matrix-and-how-to-implement-it-in-python-319202e0fe4d
test_size = 0.33
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = test_size, random_state=1234, stratify=y)

# Modelos

### demo_python

- ## decision tree

In [None]:
dt = DecisionTreeClassifier(max_depth=25, random_state=1234)

cross validate

In [None]:
scores = cross_validate(dt, X, y, cv=splitter, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T

train/test split

In [None]:
dt.fit(xtrain, ytrain)
ypred = dt.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
dt_f1 = (2*tp)/(2*tp+fp+fn)
print(dt_f1)

- ## neural network

In [None]:
nn = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=20, random_state=1234)

cross validate

In [None]:
scores_nn = cross_validate(nn, X, y, cv=splitter, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T

train/test split

In [None]:
nn.fit(xtrain, ytrain)
ypred = nn.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nn_f1 = (2*tp)/(2*tp+fp+fn)
print(nn_f1)

- ## gaussian naive bayes

In [None]:
nb = GaussianNB()
scores_nb = cross_validate(nb, X, y, cv=splitter, scoring=METRICS)
nb_scores = pd.DataFrame(scores_nb)
pd.DataFrame(nb_scores.mean()).T

- ## support vector classification

In [None]:
svm = SVC(random_state=1234, probability=True)
scores_svm = cross_validate(svm, X, y, cv=splitter, scoring=METRICS)
svm_scores = pd.DataFrame(scores_svm)
pd.DataFrame(svm_scores.mean()).T

- ## nearest neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')

In [None]:
nbrs.fit(xtrain)
ypred = nbrs.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nbrs_f1 = (2*tp)/(2*tp+fp+fn)
print(nbrs_f1)

- ## nearest centroid

In [None]:
from sklearn.neighbors import NearestCentroid
nc = NearestCentroid()
scores_nc = cross_validate(nc, X, y, cv=splitter, scoring=METRICS)
clf_scores = pd.DataFrame(scores_nc)
pd.DataFrame(nc_scores.mean()).T

### ensembles

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

- ## random forest

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)

cross validate

In [None]:
scores_clf = cross_validate(clf, X, y, cv=splitter, scoring=METRICS)
clf_scores = pd.DataFrame(scores_clf)
pd.DataFrame(clf_scores.mean()).T

train/test split

In [None]:
clf.fit(xtrain, ytrain)
ypred = clf.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
clf_f1 = (2*tp)/(2*tp+fp+fn)
print(clf_f1)

- ## adaboost

In [None]:
ada = AdaBoostClassifier(n_estimators=100)

cross validate

In [None]:
scores_ada = cross_validate(ada, X, y, cv=splitter, scoring=METRICS)
ada_scores = pd.DataFrame(scores_ada)
pd.DataFrame(ada_scores.mean()).T

train/test split

In [None]:
ada.fit(xtrain, ytrain)
ypred = ada.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ada_f1 = (2*tp)/(2*tp+fp+fn)
print(ada_f1)

- ## gradient boost

In [None]:
gbc = GradientBoostingClassifier(n_estimators=250, learning_rate=1.0, max_depth=20, random_state=0).fit(X, y)
scores_gbc = cross_validate(gbc, X, y, cv=splitter, scoring=METRICS)
gbc_scores = pd.DataFrame(scores_gbc)
pd.DataFrame(gbc_scores.mean()).T

### tpot automl

- ## xgboost

In [19]:
from xgboost import XGBClassifier

In [16]:
xgb = XGBClassifier(objective = 'binary:logistic',seed=42)
#learning_rate=0.1, max_depth=25, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0,

cross-validate

In [None]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

train/test split

In [21]:
# train/test
#xgb.fit(xtrain, ytrain, verbose=True, early_stopping_rounds=10, eval_metric='map', eval_set=[(xtest,ytest)])
#ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

0.7544177720659109


### xgboost youtube
- https://www.youtube.com/watch?v=ap2SS0-XPcE

In [34]:
model_xgboost = XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='map',
                                      verbosity=1)

eval_set = [(xtest, ytest)]

model_xgboost.fit(xtrain,
                  ytrain,
                  early_stopping_rounds=10,
                  eval_set=eval_set,
                  verbose=True)

[0]	validation_0-map:0.70657
[1]	validation_0-map:0.72107
[2]	validation_0-map:0.73143
[3]	validation_0-map:0.73398
[4]	validation_0-map:0.74036
[5]	validation_0-map:0.74520
[6]	validation_0-map:0.74652
[7]	validation_0-map:0.75027
[8]	validation_0-map:0.75119
[9]	validation_0-map:0.75281
[10]	validation_0-map:0.75338
[11]	validation_0-map:0.75364
[12]	validation_0-map:0.75461
[13]	validation_0-map:0.75471
[14]	validation_0-map:0.75567
[15]	validation_0-map:0.75734
[16]	validation_0-map:0.75805
[17]	validation_0-map:0.75798
[18]	validation_0-map:0.75773
[19]	validation_0-map:0.75740
[20]	validation_0-map:0.75747
[21]	validation_0-map:0.75786
[22]	validation_0-map:0.75930
[23]	validation_0-map:0.76025
[24]	validation_0-map:0.76058
[25]	validation_0-map:0.76172
[26]	validation_0-map:0.76266
[27]	validation_0-map:0.76306
[28]	validation_0-map:0.76361
[29]	validation_0-map:0.76382
[30]	validation_0-map:0.76430
[31]	validation_0-map:0.76496
[32]	validation_0-map:0.76502
[33]	validation_0-ma

[268]	validation_0-map:0.79758
[269]	validation_0-map:0.79758
[270]	validation_0-map:0.79761
[271]	validation_0-map:0.79761
[272]	validation_0-map:0.79764
[273]	validation_0-map:0.79768
[274]	validation_0-map:0.79775
[275]	validation_0-map:0.79782
[276]	validation_0-map:0.79787
[277]	validation_0-map:0.79793
[278]	validation_0-map:0.79796
[279]	validation_0-map:0.79802
[280]	validation_0-map:0.79805
[281]	validation_0-map:0.79809
[282]	validation_0-map:0.79815
[283]	validation_0-map:0.79819
[284]	validation_0-map:0.79823
[285]	validation_0-map:0.79829
[286]	validation_0-map:0.79832
[287]	validation_0-map:0.79837
[288]	validation_0-map:0.79838
[289]	validation_0-map:0.79840
[290]	validation_0-map:0.79842
[291]	validation_0-map:0.79846
[292]	validation_0-map:0.79848
[293]	validation_0-map:0.79852
[294]	validation_0-map:0.79855
[295]	validation_0-map:0.79857
[296]	validation_0-map:0.79862
[297]	validation_0-map:0.79862
[298]	validation_0-map:0.79865
[299]	validation_0-map:0.79872
[300]	va

[533]	validation_0-map:0.80368
[534]	validation_0-map:0.80368
[535]	validation_0-map:0.80369
[536]	validation_0-map:0.80370
[537]	validation_0-map:0.80371
[538]	validation_0-map:0.80373
[539]	validation_0-map:0.80377
[540]	validation_0-map:0.80377
[541]	validation_0-map:0.80378
[542]	validation_0-map:0.80379
[543]	validation_0-map:0.80382
[544]	validation_0-map:0.80383
[545]	validation_0-map:0.80384
[546]	validation_0-map:0.80384
[547]	validation_0-map:0.80386
[548]	validation_0-map:0.80389
[549]	validation_0-map:0.80391
[550]	validation_0-map:0.80392
[551]	validation_0-map:0.80393
[552]	validation_0-map:0.80394
[553]	validation_0-map:0.80395
[554]	validation_0-map:0.80397
[555]	validation_0-map:0.80399
[556]	validation_0-map:0.80401
[557]	validation_0-map:0.80400
[558]	validation_0-map:0.80405
[559]	validation_0-map:0.80405
[560]	validation_0-map:0.80407
[561]	validation_0-map:0.80409
[562]	validation_0-map:0.80412
[563]	validation_0-map:0.80414
[564]	validation_0-map:0.80413
[565]	va

[798]	validation_0-map:0.80657
[799]	validation_0-map:0.80657
[800]	validation_0-map:0.80658
[801]	validation_0-map:0.80659
[802]	validation_0-map:0.80660
[803]	validation_0-map:0.80661
[804]	validation_0-map:0.80664
[805]	validation_0-map:0.80664
[806]	validation_0-map:0.80666
[807]	validation_0-map:0.80666
[808]	validation_0-map:0.80668
[809]	validation_0-map:0.80669
[810]	validation_0-map:0.80670
[811]	validation_0-map:0.80671
[812]	validation_0-map:0.80671
[813]	validation_0-map:0.80672
[814]	validation_0-map:0.80672
[815]	validation_0-map:0.80672
[816]	validation_0-map:0.80672
[817]	validation_0-map:0.80672
[818]	validation_0-map:0.80673
[819]	validation_0-map:0.80673
[820]	validation_0-map:0.80674
[821]	validation_0-map:0.80675
[822]	validation_0-map:0.80676
[823]	validation_0-map:0.80676
[824]	validation_0-map:0.80676
[825]	validation_0-map:0.80676
[826]	validation_0-map:0.80676
[827]	validation_0-map:0.80677
[828]	validation_0-map:0.80677
[829]	validation_0-map:0.80678
[830]	va

[1061]	validation_0-map:0.80818
[1062]	validation_0-map:0.80819
[1063]	validation_0-map:0.80819
[1064]	validation_0-map:0.80820
[1065]	validation_0-map:0.80821
[1066]	validation_0-map:0.80821
[1067]	validation_0-map:0.80823
[1068]	validation_0-map:0.80824
[1069]	validation_0-map:0.80825
[1070]	validation_0-map:0.80826
[1071]	validation_0-map:0.80825
[1072]	validation_0-map:0.80826
[1073]	validation_0-map:0.80826
[1074]	validation_0-map:0.80826
[1075]	validation_0-map:0.80826
[1076]	validation_0-map:0.80826
[1077]	validation_0-map:0.80827
[1078]	validation_0-map:0.80827
[1079]	validation_0-map:0.80827
[1080]	validation_0-map:0.80828
[1081]	validation_0-map:0.80830
[1082]	validation_0-map:0.80832
[1083]	validation_0-map:0.80833
[1084]	validation_0-map:0.80833
[1085]	validation_0-map:0.80835
[1086]	validation_0-map:0.80835
[1087]	validation_0-map:0.80835
[1088]	validation_0-map:0.80836
[1089]	validation_0-map:0.80837
[1090]	validation_0-map:0.80837
[1091]	validation_0-map:0.80837
[1092]	v

[1318]	validation_0-map:0.80925
[1319]	validation_0-map:0.80926
[1320]	validation_0-map:0.80926
[1321]	validation_0-map:0.80926
[1322]	validation_0-map:0.80927
[1323]	validation_0-map:0.80927
[1324]	validation_0-map:0.80927
[1325]	validation_0-map:0.80927
[1326]	validation_0-map:0.80927
[1327]	validation_0-map:0.80927
[1328]	validation_0-map:0.80927
[1329]	validation_0-map:0.80927
[1330]	validation_0-map:0.80928
[1331]	validation_0-map:0.80928
[1332]	validation_0-map:0.80929
[1333]	validation_0-map:0.80928
[1334]	validation_0-map:0.80928
[1335]	validation_0-map:0.80929
[1336]	validation_0-map:0.80929
[1337]	validation_0-map:0.80929
[1338]	validation_0-map:0.80930
[1339]	validation_0-map:0.80930
[1340]	validation_0-map:0.80931
[1341]	validation_0-map:0.80931
[1342]	validation_0-map:0.80931
[1343]	validation_0-map:0.80932
[1344]	validation_0-map:0.80932
[1345]	validation_0-map:0.80933
[1346]	validation_0-map:0.80934
[1347]	validation_0-map:0.80935
[1348]	validation_0-map:0.80935
[1349]	v

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.5,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='map', feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=5, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=5000, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

In [35]:
y_train_pred = model_xgboost.predict(xtrain)
y_valid_pred = model_xgboost.predict(xtest)

#print("MAP Train: {:.4f}\nMAP Valid: {:.4f}".format(confusion_matrix(ytrain, y_train_pred),
#                                                    confusion_matrix(ytest, y_valid_pred)))

In [38]:
cm = f1_score(ytest,y_valid_pred)
#tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
#f1 = (2*tp)/(2*tp+fp+fn)
#print(f1)
print(cm)

0.7153612262676732


In [40]:
learning_rate_list = [0.02, 0.05, 0.1]
max_depth_list = [2, 3, 5]
n_estimators_list = [1000, 2000, 3000]

params_dict = {"learning_rate": learning_rate_list,
               "max_depth": max_depth_list,
               "n_estimators": n_estimators_list}

num_combinations = 1
for v in params_dict.values(): num_combinations *= len(v) 

print(num_combinations)
params_dict

27


{'learning_rate': [0.02, 0.05, 0.1],
 'max_depth': [2, 3, 5],
 'n_estimators': [1000, 2000, 3000]}

In [45]:
def my_f1_score(model, X, y): return f1_score(y, model.predict(X))

model_xgboost_hp = GridSearchCV(estimator=XGBClassifier(subsample=0.5,
                                                                colsample_bytree=0.25,
                                                                eval_metric='map',
                                                                use_label_encoder=False),
                                param_grid=params_dict,
                                cv=2,
                                scoring=my_f1_score,
                                return_train_score=True,
                                verbose=4)

model_xgboost_hp.fit(X, y)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.641, test=0.633) total time=  43.5s
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=1000;, score=(train=0.649, test=0.649) total time=  42.4s
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.681, test=0.671) total time= 1.5min
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=2000;, score=(train=0.673, test=0.674) total time= 1.4min
[CV 1/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.690, test=0.679) total time= 2.1min
[CV 2/2] END learning_rate=0.02, max_depth=2, n_estimators=3000;, score=(train=0.684, test=0.685) total time= 2.2min
[CV 1/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.676, test=0.667) total time=  51.3s
[CV 2/2] END learning_rate=0.02, max_depth=3, n_estimators=1000;, score=(train=0.673, test=0.676) total time=  51.5s
[CV

GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.25,
                                     early_stopping_rounds=None,
                                     enable_categorical=False,
                                     eval_metric='map', feature_types=None,
                                     gamma=None, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None...
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                   

In [46]:
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                               'param_learning_rate', 'param_max_depth', 'param_n_estimators']]
df_cv_results.sort_values(by='rank_test_score', inplace=True)
df_cv_results

Unnamed: 0,rank_test_score,mean_test_score,mean_train_score,param_learning_rate,param_max_depth,param_n_estimators
25,1,0.704338,0.722704,0.1,5,2000
26,2,0.704107,0.727214,0.1,5,3000
23,3,0.703441,0.712279,0.1,3,3000
17,4,0.702788,0.719586,0.05,5,3000
16,5,0.701774,0.715282,0.05,5,2000
22,6,0.701316,0.70896,0.1,3,2000
24,7,0.70113,0.714925,0.1,5,1000
8,8,0.699658,0.709936,0.02,5,3000
14,9,0.698461,0.70635,0.05,3,3000
15,10,0.698224,0.707647,0.05,5,1000


In [47]:
# First sort by number of estimators as that would be x-axis
df_cv_results.sort_values(by='param_n_estimators', inplace=True)

# Find values of AUC for learning rate of 0.05 and different values of depth
lr_d2 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==2),:]
lr_d3 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==3),:]
lr_d5 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==5),:]
lr_d7 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==7),:]

# Let us plot now
fig, ax = plt.subplots(figsize=(10,5))
lr_d2.plot(x='param_n_estimators', y='mean_test_score', label='Depth=2', ax=ax)
lr_d3.plot(x='param_n_estimators', y='mean_test_score', label='Depth=3', ax=ax)
lr_d5.plot(x='param_n_estimators', y='mean_test_score', label='Depth=5', ax=ax)
lr_d7.plot(x='param_n_estimators', y='mean_test_score', label='Depth=7', ax=ax)
plt.ylabel('Mean Validation AUC')
plt.title('Performance wrt # of Trees and Depth')

NameError: name 'plt' is not defined

In [None]:
# First sort by learning rate as that would be x-axis
df_cv_results.sort_values(by='param_learning_rate', inplace=True)

# Find values of AUC for learning rate of 0.05 and different values of depth
lr_t3k_d2 = df_cv_results.loc[(df_cv_results['param_n_estimators']==3000) & (df_cv_results['param_max_depth']==2),:]

# Let us plot now
fig, ax = plt.subplots(figsize=(10,5))
lr_t3k_d2.plot(x='param_learning_rate', y='mean_test_score', label='Depth=2, Trees=3000', ax=ax)
plt.ylabel('Mean Validation AUC')
plt.title('Performance wrt learning rate')

# rank_test_score	mean_test_score	mean_train_score	param_learning_rate	param_max_depth	param_n_estimators
# 1	0.704338	0.722704	0.1	5	2000

In [50]:
model_xgboost_fin = XGBClassifier(learning_rate=0.1,
                                          max_depth=5,
                                          n_estimators=2000,
                                          subsample=0.5,
                                          colsample_bytree=0.25,
                                          eval_metric='map',
                                          verbosity=1,
                                          use_label_encoder=False)

# Passing both training and validation dataset as we want to plot AUC for both
eval_set = [(xtrain, ytrain),(xtest, ytest)]

model_xgboost_fin.fit(xtrain,
                  ytrain,
                  early_stopping_rounds=20,
                  eval_set=eval_set,
                  verbose=True)

[0]	validation_0-map:0.56438	validation_1-map:0.56455
[1]	validation_0-map:0.57185	validation_1-map:0.57259
[2]	validation_0-map:0.57123	validation_1-map:0.57215
[3]	validation_0-map:0.65435	validation_1-map:0.65565
[4]	validation_0-map:0.66381	validation_1-map:0.66494
[5]	validation_0-map:0.71127	validation_1-map:0.71217
[6]	validation_0-map:0.73149	validation_1-map:0.73218
[7]	validation_0-map:0.73947	validation_1-map:0.73969
[8]	validation_0-map:0.73883	validation_1-map:0.73898
[9]	validation_0-map:0.73612	validation_1-map:0.73633
[10]	validation_0-map:0.73978	validation_1-map:0.73998
[11]	validation_0-map:0.73914	validation_1-map:0.73938
[12]	validation_0-map:0.74003	validation_1-map:0.74028
[13]	validation_0-map:0.74094	validation_1-map:0.74116
[14]	validation_0-map:0.74274	validation_1-map:0.74280
[15]	validation_0-map:0.74329	validation_1-map:0.74339
[16]	validation_0-map:0.74349	validation_1-map:0.74347
[17]	validation_0-map:0.74666	validation_1-map:0.74659
[18]	validation_0-ma

[149]	validation_0-map:0.78456	validation_1-map:0.78420
[150]	validation_0-map:0.78457	validation_1-map:0.78421
[151]	validation_0-map:0.78468	validation_1-map:0.78432
[152]	validation_0-map:0.78478	validation_1-map:0.78440
[153]	validation_0-map:0.78497	validation_1-map:0.78460
[154]	validation_0-map:0.78497	validation_1-map:0.78460
[155]	validation_0-map:0.78497	validation_1-map:0.78461
[156]	validation_0-map:0.78519	validation_1-map:0.78482
[157]	validation_0-map:0.78523	validation_1-map:0.78484
[158]	validation_0-map:0.78544	validation_1-map:0.78501
[159]	validation_0-map:0.78555	validation_1-map:0.78516
[160]	validation_0-map:0.78578	validation_1-map:0.78540
[161]	validation_0-map:0.78592	validation_1-map:0.78553
[162]	validation_0-map:0.78593	validation_1-map:0.78554
[163]	validation_0-map:0.78601	validation_1-map:0.78561
[164]	validation_0-map:0.78618	validation_1-map:0.78576
[165]	validation_0-map:0.78638	validation_1-map:0.78596
[166]	validation_0-map:0.78643	validation_1-map:

[296]	validation_0-map:0.79551	validation_1-map:0.79414
[297]	validation_0-map:0.79556	validation_1-map:0.79418
[298]	validation_0-map:0.79556	validation_1-map:0.79418
[299]	validation_0-map:0.79557	validation_1-map:0.79420
[300]	validation_0-map:0.79566	validation_1-map:0.79427
[301]	validation_0-map:0.79573	validation_1-map:0.79432
[302]	validation_0-map:0.79573	validation_1-map:0.79432
[303]	validation_0-map:0.79575	validation_1-map:0.79435
[304]	validation_0-map:0.79575	validation_1-map:0.79436
[305]	validation_0-map:0.79577	validation_1-map:0.79437
[306]	validation_0-map:0.79577	validation_1-map:0.79437
[307]	validation_0-map:0.79583	validation_1-map:0.79442
[308]	validation_0-map:0.79595	validation_1-map:0.79453
[309]	validation_0-map:0.79602	validation_1-map:0.79460
[310]	validation_0-map:0.79602	validation_1-map:0.79460
[311]	validation_0-map:0.79604	validation_1-map:0.79462
[312]	validation_0-map:0.79613	validation_1-map:0.79470
[313]	validation_0-map:0.79617	validation_1-map:

[443]	validation_0-map:0.80088	validation_1-map:0.79878
[444]	validation_0-map:0.80091	validation_1-map:0.79881
[445]	validation_0-map:0.80092	validation_1-map:0.79882
[446]	validation_0-map:0.80093	validation_1-map:0.79883
[447]	validation_0-map:0.80095	validation_1-map:0.79885
[448]	validation_0-map:0.80095	validation_1-map:0.79885
[449]	validation_0-map:0.80098	validation_1-map:0.79887
[450]	validation_0-map:0.80101	validation_1-map:0.79888
[451]	validation_0-map:0.80107	validation_1-map:0.79894
[452]	validation_0-map:0.80107	validation_1-map:0.79894
[453]	validation_0-map:0.80112	validation_1-map:0.79899
[454]	validation_0-map:0.80112	validation_1-map:0.79899
[455]	validation_0-map:0.80112	validation_1-map:0.79899
[456]	validation_0-map:0.80112	validation_1-map:0.79899
[457]	validation_0-map:0.80116	validation_1-map:0.79903
[458]	validation_0-map:0.80116	validation_1-map:0.79903
[459]	validation_0-map:0.80119	validation_1-map:0.79905
[460]	validation_0-map:0.80123	validation_1-map:

[590]	validation_0-map:0.80386	validation_1-map:0.80112
[591]	validation_0-map:0.80390	validation_1-map:0.80117
[592]	validation_0-map:0.80395	validation_1-map:0.80120
[593]	validation_0-map:0.80397	validation_1-map:0.80121
[594]	validation_0-map:0.80397	validation_1-map:0.80121
[595]	validation_0-map:0.80403	validation_1-map:0.80127
[596]	validation_0-map:0.80406	validation_1-map:0.80130
[597]	validation_0-map:0.80407	validation_1-map:0.80130
[598]	validation_0-map:0.80408	validation_1-map:0.80131
[599]	validation_0-map:0.80410	validation_1-map:0.80132
[600]	validation_0-map:0.80412	validation_1-map:0.80135
[601]	validation_0-map:0.80414	validation_1-map:0.80137
[602]	validation_0-map:0.80417	validation_1-map:0.80139
[603]	validation_0-map:0.80422	validation_1-map:0.80141
[604]	validation_0-map:0.80426	validation_1-map:0.80145
[605]	validation_0-map:0.80425	validation_1-map:0.80143
[606]	validation_0-map:0.80426	validation_1-map:0.80144
[607]	validation_0-map:0.80430	validation_1-map:

[737]	validation_0-map:0.80663	validation_1-map:0.80312
[738]	validation_0-map:0.80662	validation_1-map:0.80311
[739]	validation_0-map:0.80667	validation_1-map:0.80316
[740]	validation_0-map:0.80667	validation_1-map:0.80316
[741]	validation_0-map:0.80667	validation_1-map:0.80316
[742]	validation_0-map:0.80668	validation_1-map:0.80316
[743]	validation_0-map:0.80670	validation_1-map:0.80319
[744]	validation_0-map:0.80670	validation_1-map:0.80319
[745]	validation_0-map:0.80670	validation_1-map:0.80319
[746]	validation_0-map:0.80675	validation_1-map:0.80323
[747]	validation_0-map:0.80677	validation_1-map:0.80325
[748]	validation_0-map:0.80682	validation_1-map:0.80327
[749]	validation_0-map:0.80682	validation_1-map:0.80327
[750]	validation_0-map:0.80682	validation_1-map:0.80328
[751]	validation_0-map:0.80685	validation_1-map:0.80329
[752]	validation_0-map:0.80686	validation_1-map:0.80329
[753]	validation_0-map:0.80687	validation_1-map:0.80330
[754]	validation_0-map:0.80687	validation_1-map:

[884]	validation_0-map:0.80832	validation_1-map:0.80434
[885]	validation_0-map:0.80832	validation_1-map:0.80434
[886]	validation_0-map:0.80836	validation_1-map:0.80437
[887]	validation_0-map:0.80840	validation_1-map:0.80439
[888]	validation_0-map:0.80842	validation_1-map:0.80441
[889]	validation_0-map:0.80842	validation_1-map:0.80441
[890]	validation_0-map:0.80843	validation_1-map:0.80441
[891]	validation_0-map:0.80844	validation_1-map:0.80443
[892]	validation_0-map:0.80846	validation_1-map:0.80444
[893]	validation_0-map:0.80848	validation_1-map:0.80446
[894]	validation_0-map:0.80851	validation_1-map:0.80449
[895]	validation_0-map:0.80852	validation_1-map:0.80451
[896]	validation_0-map:0.80853	validation_1-map:0.80451
[897]	validation_0-map:0.80853	validation_1-map:0.80451
[898]	validation_0-map:0.80852	validation_1-map:0.80451
[899]	validation_0-map:0.80853	validation_1-map:0.80451
[900]	validation_0-map:0.80853	validation_1-map:0.80451
[901]	validation_0-map:0.80856	validation_1-map:

[1030]	validation_0-map:0.81004	validation_1-map:0.80548
[1031]	validation_0-map:0.81005	validation_1-map:0.80548
[1032]	validation_0-map:0.81007	validation_1-map:0.80550
[1033]	validation_0-map:0.81007	validation_1-map:0.80550
[1034]	validation_0-map:0.81008	validation_1-map:0.80549
[1035]	validation_0-map:0.81008	validation_1-map:0.80549
[1036]	validation_0-map:0.81009	validation_1-map:0.80550
[1037]	validation_0-map:0.81009	validation_1-map:0.80549
[1038]	validation_0-map:0.81011	validation_1-map:0.80550
[1039]	validation_0-map:0.81011	validation_1-map:0.80550
[1040]	validation_0-map:0.81011	validation_1-map:0.80551
[1041]	validation_0-map:0.81011	validation_1-map:0.80551
[1042]	validation_0-map:0.81012	validation_1-map:0.80551
[1043]	validation_0-map:0.81015	validation_1-map:0.80551
[1044]	validation_0-map:0.81017	validation_1-map:0.80553
[1045]	validation_0-map:0.81018	validation_1-map:0.80553
[1046]	validation_0-map:0.81019	validation_1-map:0.80554
[1047]	validation_0-map:0.81021

[1174]	validation_0-map:0.81129	validation_1-map:0.80617
[1175]	validation_0-map:0.81131	validation_1-map:0.80618
[1176]	validation_0-map:0.81131	validation_1-map:0.80618
[1177]	validation_0-map:0.81131	validation_1-map:0.80618
[1178]	validation_0-map:0.81131	validation_1-map:0.80617
[1179]	validation_0-map:0.81131	validation_1-map:0.80617
[1180]	validation_0-map:0.81131	validation_1-map:0.80617
[1181]	validation_0-map:0.81131	validation_1-map:0.80617
[1182]	validation_0-map:0.81133	validation_1-map:0.80619
[1183]	validation_0-map:0.81134	validation_1-map:0.80619
[1184]	validation_0-map:0.81134	validation_1-map:0.80618
[1185]	validation_0-map:0.81135	validation_1-map:0.80619
[1186]	validation_0-map:0.81136	validation_1-map:0.80620
[1187]	validation_0-map:0.81137	validation_1-map:0.80621
[1188]	validation_0-map:0.81137	validation_1-map:0.80621
[1189]	validation_0-map:0.81137	validation_1-map:0.80621
[1190]	validation_0-map:0.81140	validation_1-map:0.80622
[1191]	validation_0-map:0.81141

[1318]	validation_0-map:0.81250	validation_1-map:0.80681
[1319]	validation_0-map:0.81250	validation_1-map:0.80681
[1320]	validation_0-map:0.81251	validation_1-map:0.80681
[1321]	validation_0-map:0.81251	validation_1-map:0.80681
[1322]	validation_0-map:0.81252	validation_1-map:0.80682
[1323]	validation_0-map:0.81252	validation_1-map:0.80682
[1324]	validation_0-map:0.81252	validation_1-map:0.80682
[1325]	validation_0-map:0.81253	validation_1-map:0.80682
[1326]	validation_0-map:0.81255	validation_1-map:0.80684
[1327]	validation_0-map:0.81255	validation_1-map:0.80684
[1328]	validation_0-map:0.81255	validation_1-map:0.80684
[1329]	validation_0-map:0.81256	validation_1-map:0.80684
[1330]	validation_0-map:0.81258	validation_1-map:0.80686
[1331]	validation_0-map:0.81259	validation_1-map:0.80686
[1332]	validation_0-map:0.81259	validation_1-map:0.80686
[1333]	validation_0-map:0.81259	validation_1-map:0.80686
[1334]	validation_0-map:0.81259	validation_1-map:0.80686
[1335]	validation_0-map:0.81259

[1462]	validation_0-map:0.81352	validation_1-map:0.80739
[1463]	validation_0-map:0.81352	validation_1-map:0.80738
[1464]	validation_0-map:0.81351	validation_1-map:0.80738
[1465]	validation_0-map:0.81352	validation_1-map:0.80739
[1466]	validation_0-map:0.81352	validation_1-map:0.80739
[1467]	validation_0-map:0.81353	validation_1-map:0.80739
[1468]	validation_0-map:0.81354	validation_1-map:0.80739
[1469]	validation_0-map:0.81354	validation_1-map:0.80739
[1470]	validation_0-map:0.81355	validation_1-map:0.80740
[1471]	validation_0-map:0.81358	validation_1-map:0.80742
[1472]	validation_0-map:0.81358	validation_1-map:0.80742
[1473]	validation_0-map:0.81358	validation_1-map:0.80742
[1474]	validation_0-map:0.81361	validation_1-map:0.80743
[1475]	validation_0-map:0.81361	validation_1-map:0.80743
[1476]	validation_0-map:0.81361	validation_1-map:0.80744
[1477]	validation_0-map:0.81361	validation_1-map:0.80744
[1478]	validation_0-map:0.81361	validation_1-map:0.80744
[1479]	validation_0-map:0.81364

[1606]	validation_0-map:0.81428	validation_1-map:0.80774
[1607]	validation_0-map:0.81429	validation_1-map:0.80774
[1608]	validation_0-map:0.81429	validation_1-map:0.80774
[1609]	validation_0-map:0.81429	validation_1-map:0.80775
[1610]	validation_0-map:0.81429	validation_1-map:0.80775
[1611]	validation_0-map:0.81429	validation_1-map:0.80775
[1612]	validation_0-map:0.81430	validation_1-map:0.80775
[1613]	validation_0-map:0.81430	validation_1-map:0.80775
[1614]	validation_0-map:0.81431	validation_1-map:0.80775
[1615]	validation_0-map:0.81433	validation_1-map:0.80777
[1616]	validation_0-map:0.81435	validation_1-map:0.80777
[1617]	validation_0-map:0.81436	validation_1-map:0.80778
[1618]	validation_0-map:0.81438	validation_1-map:0.80779
[1619]	validation_0-map:0.81438	validation_1-map:0.80779
[1620]	validation_0-map:0.81438	validation_1-map:0.80779
[1621]	validation_0-map:0.81438	validation_1-map:0.80779
[1622]	validation_0-map:0.81439	validation_1-map:0.80780
[1623]	validation_0-map:0.81440

[1750]	validation_0-map:0.81524	validation_1-map:0.80825
[1751]	validation_0-map:0.81525	validation_1-map:0.80825
[1752]	validation_0-map:0.81525	validation_1-map:0.80825
[1753]	validation_0-map:0.81525	validation_1-map:0.80825
[1754]	validation_0-map:0.81525	validation_1-map:0.80826
[1755]	validation_0-map:0.81525	validation_1-map:0.80825
[1756]	validation_0-map:0.81526	validation_1-map:0.80826
[1757]	validation_0-map:0.81529	validation_1-map:0.80827
[1758]	validation_0-map:0.81531	validation_1-map:0.80829
[1759]	validation_0-map:0.81531	validation_1-map:0.80829
[1760]	validation_0-map:0.81532	validation_1-map:0.80830
[1761]	validation_0-map:0.81532	validation_1-map:0.80830
[1762]	validation_0-map:0.81532	validation_1-map:0.80830
[1763]	validation_0-map:0.81532	validation_1-map:0.80830
[1764]	validation_0-map:0.81533	validation_1-map:0.80830
[1765]	validation_0-map:0.81532	validation_1-map:0.80830
[1766]	validation_0-map:0.81533	validation_1-map:0.80829
[1767]	validation_0-map:0.81535

[1894]	validation_0-map:0.81609	validation_1-map:0.80855
[1895]	validation_0-map:0.81609	validation_1-map:0.80855
[1896]	validation_0-map:0.81610	validation_1-map:0.80857
[1897]	validation_0-map:0.81611	validation_1-map:0.80857
[1898]	validation_0-map:0.81612	validation_1-map:0.80858
[1899]	validation_0-map:0.81614	validation_1-map:0.80859
[1900]	validation_0-map:0.81616	validation_1-map:0.80860
[1901]	validation_0-map:0.81616	validation_1-map:0.80860
[1902]	validation_0-map:0.81616	validation_1-map:0.80860
[1903]	validation_0-map:0.81616	validation_1-map:0.80860
[1904]	validation_0-map:0.81616	validation_1-map:0.80860
[1905]	validation_0-map:0.81616	validation_1-map:0.80860
[1906]	validation_0-map:0.81617	validation_1-map:0.80860
[1907]	validation_0-map:0.81616	validation_1-map:0.80859
[1908]	validation_0-map:0.81616	validation_1-map:0.80860
[1909]	validation_0-map:0.81617	validation_1-map:0.80860
[1910]	validation_0-map:0.81617	validation_1-map:0.80860
[1911]	validation_0-map:0.81618

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.25,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='map', feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=5, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=2000, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=0, ...)

### meu xgboost

In [None]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=25, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)

In [None]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

In [None]:
# train/test
xgb.fit(xtrain, ytrain)
ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

- ## pipeline?

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import MaxAbsScaler

In [None]:
#exported_pipeline = make_pipeline(
#    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1, max_depth=8, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)),
#    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.4, min_samples_leaf=9, min_samples_split=18, n_estimators=100)),
#    MultinomialNB(alpha=1.0, fit_prior=False)
#)
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.45, min_samples_leaf=17, min_samples_split=10, n_estimators=100)),
    XGBClassifier(learning_rate=0.01, max_depth=2, min_child_weight=7, n_estimators=100, n_jobs=1, subsample=0.45, verbosity=0)
)

In [None]:
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

In [None]:
exported_pipeline.fit(xtrain, ytrain)
ypred = exported_pipeline.predict(xtest)
cm = confusion_matrix(ytest,ypred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
pipe_f1 = (2*tp)/(2*tp+fp+fn)
print(pipe_f1)

In [None]:
results = exported_pipeline.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,ypred)

In [None]:
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
pipe_f1 = (2*tp)/(2*tp+fp+fn)
print(pipe_f1)

# param optimize

In [None]:
from sklearn.model_selection import RandomizedSearchCV
model = XGBClassifier()
param_vals = {'max_depth': [200, 500, 800, 1100], 'n_estimators': [100, 200, 300, 400],
              'learning_rate': [0.001, 0.01, 0.1, 1, 10]}
random_rf = RandomizedSearchCV(estimator=model, param_distributions=param_vals,
                              n_iter=10, scoring='f1', cv=5,
                              refit=True, n_jobs=-1)

In [None]:
#Training and prediction
random_rf.fit(xtrain, ytrain)
preds = random_rf.best_estimator_.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

## ensembles
- https://towardsdatascience.com/ensemble-learning-using-scikit-learn-85c4531ff86a

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

- ## K-Nearest Neighbors

In [None]:
#fix asap
d = d.dropna()

In [None]:
X, y = d.drop("target", axis=1), d["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [None]:
#create new a knn model
knn = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)
#fit model to training data
knn_gs.fit(X_train, y_train)

In [None]:
#save best model
knn_best = knn_gs.best_estimator_
#check best n_neigbors value
print(knn_gs.best_params_)

- ## Random Forest

In [None]:
#create a new random forest classifier
rf = RandomForestClassifier()
#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}
#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)
#fit model to training data
rf_gs.fit(X_train, y_train)

In [None]:
#save best model
rf_best = rf_gs.best_estimator_
#check best n_estimators value
print(rf_gs.best_params_)

- ## Logistic Regression

In [None]:
#create a new logistic regression model
log_reg = LogisticRegression()
#fit the model to the training data
log_reg.fit(X_train, y_train)

resultados dos 3

In [None]:
#test the three models with the test data and print their accuracy scores
print('knn: {}'.format(knn_best.score(X_test, y_test)))
print('rf: {}'.format(rf_best.score(X_test, y_test)))
print('log_reg: {}'.format(log_reg.score(X_test, y_test)))

- ## Voting Classifier
resultado do conjunto dos 3

In [None]:
from sklearn.ensemble import VotingClassifier
#create a dictionary of our models
estimators=[('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

In [None]:
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
ensemble.score(X_test, y_test)

In [None]:
ypred = ensemble.predict(xtest)
cm = confusion_matrix(ytest,ypred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ensemble_f1 = (2*tp)/(2*tp+fp+fn)
print(ensemble_f1)