In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [3]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [4]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

### upload and divide data

In [32]:
#d = pd.read_csv("dados/06-categoricas.csv")
d.head()

count    936640.0
mean          0.0
std           0.0
min           0.0
25%           0.0
50%           0.0
75%           0.0
max           0.0
Name: target, dtype: float64

In [None]:
# sample data? faster models
#d = d.sample(frac=0.01)

In [17]:
X, y = d.drop("target", axis=1), d["target"]

### prepare for cross validation and/or train/test split

In [21]:
# cross-validation splitting strategy
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

In [18]:
# train/test split
# https://towardsdatascience.com/understanding-the-confusion-matrix-and-how-to-implement-it-in-python-319202e0fe4d
test_size = 0.33
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = test_size, random_state=1234, stratify=y)

# Modelos

## demo_python

- ### decision tree

In [19]:
dt = DecisionTreeClassifier(max_depth=25, random_state=1234)

cross validate \
scenarios 1, 2, 3, 4, 5

In [22]:
scores = cross_validate(dt, X, y, cv=splitter, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.359414,0.110348,,,,,,,


train/test split

In [27]:
dt.fit(xtrain, ytrain)
ypred = dt.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
dt_f1 = (2*tp)/(2*tp+fp+fn)
print(dt_f1)

IndexError: index 1 is out of bounds for axis 0 with size 1

- ### multi layer perceptron

In [None]:
nn = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=20, random_state=1234)
scores_nn = cross_validate(nn, X, y, cv=splitter, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T

- ### gaussian naive bayes

In [None]:
nb = GaussianNB()
scores_nb = cross_validate(nb, X, y, cv=splitter, scoring=METRICS)
nb_scores = pd.DataFrame(scores_nb)
pd.DataFrame(nb_scores.mean()).T

- ### support vector classification

In [None]:
svm = SVC(random_state=1234, probability=True)
scores_svm = cross_validate(svm, X, y, cv=splitter, scoring=METRICS)
svm_scores = pd.DataFrame(scores_svm)
pd.DataFrame(svm_scores.mean()).T

## outros modelos

- ### nearest neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')

In [None]:
nbrs.fit(xtrain)
ypred = nbrs.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nbrs_f1 = (2*tp)/(2*tp+fp+fn)
print(nbrs_f1)

- ### nearest centroid

In [None]:
from sklearn.neighbors import NearestCentroid
nc = NearestCentroid()

In [None]:
nbrs.fit(xtrain)
nc = nc.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nc_f1 = (2*tp)/(2*tp+fp+fn)
print(nc_f1)

## ensembles

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

- ### random forest

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)

cross validate

In [None]:
scores_clf = cross_validate(clf, X, y, cv=splitter, scoring=METRICS)
clf_scores = pd.DataFrame(scores_clf)
pd.DataFrame(clf_scores.mean()).T

train/test split

In [None]:
clf.fit(xtrain, ytrain)
ypred = clf.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
clf_f1 = (2*tp)/(2*tp+fp+fn)
print(clf_f1)

- ### adaboost

In [None]:
ada = AdaBoostClassifier(n_estimators=100)

cross validate

In [None]:
scores_ada = cross_validate(ada, X, y, cv=splitter, scoring=METRICS)
ada_scores = pd.DataFrame(scores_ada)
pd.DataFrame(ada_scores.mean()).T

train/test split

In [None]:
ada.fit(xtrain, ytrain)
ypred = ada.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ada_f1 = (2*tp)/(2*tp+fp+fn)
print(ada_f1)

- ### gradient boost

In [None]:
gbc = GradientBoostingClassifier(n_estimators=250, learning_rate=1.0, max_depth=20, random_state=0).fit(X, y)

cross validate

In [None]:
scores_gbc = cross_validate(gbc, X, y, cv=splitter, scoring=METRICS)
gbc_scores = pd.DataFrame(scores_gbc)
pd.DataFrame(gbc_scores.mean()).T

train/test split

In [None]:
gbc.fit(xtrain, ytrain)
ypred = gbc.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
gbc_f1 = (2*tp)/(2*tp+fp+fn)
print(gbc_f1)

## xgboost

In [11]:
from xgboost import XGBClassifier

- ### tpot automl

In [None]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=25, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)

cross-validate

In [None]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

train/test split \
SCENARIO 6 WINNER

In [13]:
# train/test
xgb.fit(xtrain, ytrain)
ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

[0]	validation_0-map:1.00000
[1]	validation_0-map:1.00000
[2]	validation_0-map:1.00000
[3]	validation_0-map:1.00000
[4]	validation_0-map:1.00000
[5]	validation_0-map:1.00000
[6]	validation_0-map:1.00000
[7]	validation_0-map:1.00000
[8]	validation_0-map:1.00000
[9]	validation_0-map:1.00000


NameError: name 'ypred' is not defined

- ### https://www.youtube.com/watch?v=GrJP9FLV3FE

In [12]:
xgb = XGBClassifier(objective = 'binary:logistic',seed=42)

cross-validate

In [None]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

train/test split \
SCENARIO 7 WINNER

In [13]:
# train/test
xgb.fit(xtrain, ytrain, verbose=True, early_stopping_rounds=10, eval_metric='map', eval_set=[(xtest,ytest)])
#ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

[0]	validation_0-map:1.00000
[1]	validation_0-map:1.00000
[2]	validation_0-map:1.00000
[3]	validation_0-map:1.00000
[4]	validation_0-map:1.00000
[5]	validation_0-map:1.00000
[6]	validation_0-map:1.00000
[7]	validation_0-map:1.00000
[8]	validation_0-map:1.00000
[9]	validation_0-map:1.00000


NameError: name 'ypred' is not defined

- ### https://www.youtube.com/watch?v=ap2SS0-XPcE

In [None]:
model_xgboost = XGBClassifier(learning_rate=0.1,
                                      max_depth=5,
                                      n_estimators=5000,
                                      subsample=0.5,
                                      colsample_bytree=0.5,
                                      eval_metric='map',
                                      verbosity=1)

eval_set = [(xtest, ytest)]

model_xgboost.fit(xtrain,
                  ytrain,
                  early_stopping_rounds=10,
                  eval_set=eval_set,
                  verbose=True)

In [None]:
y_train_pred = model_xgboost.predict(xtrain)
y_valid_pred = model_xgboost.predict(xtest)

#print("MAP Train: {:.4f}\nMAP Valid: {:.4f}".format(confusion_matrix(ytrain, y_train_pred),
#                                                    confusion_matrix(ytest, y_valid_pred)))

In [None]:
cm = confusion_matrix(ytest,y_valid_pred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

params list to be testes in gridsearch

In [None]:
learning_rate_list = [0.02, 0.05, 0.1]
max_depth_list = [2, 3, 5]
n_estimators_list = [1000, 2000, 3000]

params_dict = {"learning_rate": learning_rate_list,
               "max_depth": max_depth_list,
               "n_estimators": n_estimators_list}

num_combinations = 1
for v in params_dict.values(): num_combinations *= len(v) 

print(num_combinations)
params_dict

gridsearch

In [None]:
def my_f1_score(model, X, y): return f1_score(y, model.predict(X))

model_xgboost_hp = GridSearchCV(estimator=XGBClassifier(subsample=0.5,
                                                                colsample_bytree=0.25,
                                                                eval_metric='map',
                                                                use_label_encoder=False),
                                param_grid=params_dict,
                                cv=2,
                                scoring=my_f1_score,
                                return_train_score=True,
                                verbose=4)

model_xgboost_hp.fit(X, y)

In [None]:
df_cv_results = pd.DataFrame(model_xgboost_hp.cv_results_)
df_cv_results = df_cv_results[['rank_test_score','mean_test_score','mean_train_score',
                               'param_learning_rate', 'param_max_depth', 'param_n_estimators']]
df_cv_results.sort_values(by='rank_test_score', inplace=True)
df_cv_results

In [None]:
# First sort by number of estimators as that would be x-axis
df_cv_results.sort_values(by='param_n_estimators', inplace=True)

# Find values of AUC for learning rate of 0.05 and different values of depth
lr_d2 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==2),:]
lr_d3 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==3),:]
lr_d5 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==5),:]
lr_d7 = df_cv_results.loc[(df_cv_results['param_learning_rate']==0.05) & (df_cv_results['param_max_depth']==7),:]

# Let us plot now
fig, ax = plt.subplots(figsize=(10,5))
lr_d2.plot(x='param_n_estimators', y='mean_test_score', label='Depth=2', ax=ax)
lr_d3.plot(x='param_n_estimators', y='mean_test_score', label='Depth=3', ax=ax)
lr_d5.plot(x='param_n_estimators', y='mean_test_score', label='Depth=5', ax=ax)
lr_d7.plot(x='param_n_estimators', y='mean_test_score', label='Depth=7', ax=ax)
plt.ylabel('Mean Validation AUC')
plt.title('Performance wrt # of Trees and Depth')

In [None]:
# First sort by learning rate as that would be x-axis
df_cv_results.sort_values(by='param_learning_rate', inplace=True)

# Find values of AUC for learning rate of 0.05 and different values of depth
lr_t3k_d2 = df_cv_results.loc[(df_cv_results['param_n_estimators']==3000) & (df_cv_results['param_max_depth']==2),:]

# Let us plot now
fig, ax = plt.subplots(figsize=(10,5))
lr_t3k_d2.plot(x='param_learning_rate', y='mean_test_score', label='Depth=2, Trees=3000', ax=ax)
plt.ylabel('Mean Validation AUC')
plt.title('Performance wrt learning rate')

# rank_test_score	mean_test_score	mean_train_score	param_learning_rate	param_max_depth	param_n_estimators
# 1	0.704338	0.722704	0.1	5	2000

In [None]:
model_xgboost_fin = XGBClassifier(learning_rate=0.1,
                                          max_depth=5,
                                          n_estimators=2000,
                                          subsample=0.5,
                                          colsample_bytree=0.25,
                                          eval_metric='map',
                                          verbosity=1,
                                          use_label_encoder=False)

# Passing both training and validation dataset as we want to plot AUC for both
eval_set = [(xtrain, ytrain),(xtest, ytest)]

model_xgboost_fin.fit(xtrain,
                  ytrain,
                  early_stopping_rounds=20,
                  eval_set=eval_set,
                  verbose=True)

## other models from tpot automl

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import MaxAbsScaler

In [None]:
#exported_pipeline = make_pipeline(
#    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1, max_depth=8, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)),
#    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.4, min_samples_leaf=9, min_samples_split=18, n_estimators=100)),
#    MultinomialNB(alpha=1.0, fit_prior=False)
#)
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.45, min_samples_leaf=17, min_samples_split=10, n_estimators=100)),
    XGBClassifier(learning_rate=0.01, max_depth=2, min_child_weight=7, n_estimators=100, n_jobs=1, subsample=0.45, verbosity=0)
)

In [None]:
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

In [None]:
exported_pipeline.fit(xtrain, ytrain)
ypred = exported_pipeline.predict(xtest)
cm = confusion_matrix(ytest,ypred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
pipe_f1 = (2*tp)/(2*tp+fp+fn)
print(pipe_f1)

In [None]:
results = exported_pipeline.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,ypred)

In [None]:
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
pipe_f1 = (2*tp)/(2*tp+fp+fn)
print(pipe_f1)

## randomized search + xgboost

In [None]:
from sklearn.model_selection import RandomizedSearchCV
model = XGBClassifier()
param_vals = {'max_depth': [200, 500, 800, 1100], 'n_estimators': [100, 200, 300, 400],
              'learning_rate': [0.001, 0.01, 0.1, 1, 10]}
random_rf = RandomizedSearchCV(estimator=model, param_distributions=param_vals,
                              n_iter=10, scoring='f1', cv=5,
                              refit=True, n_jobs=-1)

In [None]:
#Training and prediction
random_rf.fit(xtrain, ytrain)
preds = random_rf.best_estimator_.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

## ensembles
- https://towardsdatascience.com/ensemble-learning-using-scikit-learn-85c4531ff86a

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

- ### K-Nearest Neighbors

In [None]:
#fix asap
d = d.dropna()

In [None]:
X, y = d.drop("target", axis=1), d["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [None]:
#create new a knn model
knn = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)
#fit model to training data
knn_gs.fit(X_train, y_train)

In [None]:
#save best model
knn_best = knn_gs.best_estimator_
#check best n_neigbors value
print(knn_gs.best_params_)

- ### Random Forest

In [None]:
#create a new random forest classifier
rf = RandomForestClassifier()
#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}
#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)
#fit model to training data
rf_gs.fit(X_train, y_train)

In [None]:
#save best model
rf_best = rf_gs.best_estimator_
#check best n_estimators value
print(rf_gs.best_params_)

- ### Logistic Regression

In [None]:
#create a new logistic regression model
log_reg = LogisticRegression()
#fit the model to the training data
log_reg.fit(X_train, y_train)

results

In [None]:
#test the three models with the test data and print their accuracy scores
print('knn: {}'.format(knn_best.score(X_test, y_test)))
print('rf: {}'.format(rf_best.score(X_test, y_test)))
print('log_reg: {}'.format(log_reg.score(X_test, y_test)))

- ## Voting Classifier
joined result of the prior 3

In [None]:
from sklearn.ensemble import VotingClassifier
#create a dictionary of our models
estimators=[('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

In [None]:
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
ensemble.score(X_test, y_test)

In [None]:
ypred = ensemble.predict(xtest)
cm = confusion_matrix(ytest,ypred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ensemble_f1 = (2*tp)/(2*tp+fp+fn)
print(ensemble_f1)