In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [3]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [4]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

### upload and divide data

In [33]:
#d = pd.read_csv("data/03-pico-sem-town-shortname.csv")
#d = pd.read_csv("data/03-idf.csv")
dint = pd.read_csv("data/top-clients.csv")
#d = pd.read_csv("data/top-clients.csv")
#d = d.dropna()

In [34]:
d.isna().sum().sum()

0

In [None]:
d = d.sample(frac=0.01)

In [35]:
X, y = d.drop("target", axis=1), d["target"]
#Xi, yi = dint.drop("target", axis=1), dint["target"]

### prepare for cross validation and/or train/test split

In [36]:
# cross-validation splitting strategy
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

In [28]:
# train/test split
# https://towardsdatascience.com/understanding-the-confusion-matrix-and-how-to-implement-it-in-python-319202e0fe4d
test_size = 0.33
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = test_size, random_state=1234, stratify=y)
#it, xiv, yit, yiv = train_test_split(Xi, yi, test_size = test_size, random_state=1234)

# Modelos

### demo_python

- ## decision tree

In [12]:
dt = DecisionTreeClassifier(max_depth=25, random_state=1234)

cross validate

In [13]:
scores = cross_validate(dt, X, y, cv=splitter, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.081314,0.013906,0.824156,0.79216,0.794376,0.793108,0.820455,0.846119,0.640223


train/test split

In [14]:
dt.fit(xtrain, ytrain)
ypred = dt.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
dt_f1 = (2*tp)/(2*tp+fp+fn)
print(dt_f1)

0.8257709683755647


- ## neural network

In [None]:
nn = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=20, random_state=1234)

cross validate

In [None]:
scores_nn = cross_validate(nn, X, y, cv=splitter, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T

train/test split

In [None]:
nn.fit(xtrain, ytrain)
ypred = nn.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nn_f1 = (2*tp)/(2*tp+fp+fn)
print(nn_f1)

- ## gaussian naive bayes

In [None]:
nb = GaussianNB()
scores_nb = cross_validate(nb, X, y, cv=splitter, scoring=METRICS)
nb_scores = pd.DataFrame(scores_nb)
pd.DataFrame(nb_scores.mean()).T

- ## support vector classification

In [None]:
svm = SVC(random_state=1234, probability=True)
scores_svm = cross_validate(svm, X, y, cv=splitter, scoring=METRICS)
svm_scores = pd.DataFrame(scores_svm)
pd.DataFrame(svm_scores.mean()).T

- ## nearest neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')

In [None]:
nbrs.fit(xtrain)
ypred = nbrs.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nbrs_f1 = (2*tp)/(2*tp+fp+fn)
print(nbrs_f1)

- ## nearest centroid

In [None]:
from sklearn.neighbors import NearestCentroid
nc = NearestCentroid()
scores_nc = cross_validate(nc, X, y, cv=splitter, scoring=METRICS)
clf_scores = pd.DataFrame(scores_nc)
pd.DataFrame(nc_scores.mean()).T

### ensembles

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

- ## random forest

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)

cross validate

In [None]:
scores_clf = cross_validate(clf, X, y, cv=splitter, scoring=METRICS)
clf_scores = pd.DataFrame(scores_clf)
pd.DataFrame(clf_scores.mean()).T

train/test split

In [None]:
clf.fit(xtrain, ytrain)
ypred = clf.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
clf_f1 = (2*tp)/(2*tp+fp+fn)
print(clf_f1)

- ## adaboost

In [None]:
ada = AdaBoostClassifier(n_estimators=100)

cross validate

In [None]:
scores_ada = cross_validate(ada, X, y, cv=splitter, scoring=METRICS)
ada_scores = pd.DataFrame(scores_ada)
pd.DataFrame(ada_scores.mean()).T

train/test split

In [None]:
ada.fit(xtrain, ytrain)
ypred = ada.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ada_f1 = (2*tp)/(2*tp+fp+fn)
print(ada_f1)

- ## gradient boost

In [None]:
gbc = GradientBoostingClassifier(n_estimators=250, learning_rate=1.0, max_depth=20, random_state=0).fit(X, y)
scores_gbc = cross_validate(gbc, X, y, cv=splitter, scoring=METRICS)
gbc_scores = pd.DataFrame(scores_gbc)
pd.DataFrame(gbc_scores.mean()).T

### tpot automl

- ## xgboost
    - https://www.youtube.com/watch?v=GrJP9FLV3FE&t=2807s

In [37]:
from xgboost import XGBClassifier

In [38]:
xgb = XGBClassifier(objective = 'binary:logistic',seed=42)
#learning_rate=0.1, max_depth=25, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0,

cross-validate

In [39]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.299152,0.016853,0.841016,0.840176,0.772759,0.804974,0.911706,0.891363,0.671184


train/test split

In [40]:
# train/test
xgb.fit(xtrain, ytrain, verbose=True, early_stopping_rounds=10, eval_metric='map', eval_set=[(xtest,ytest)])
ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

[0]	validation_0-map:0.83386
[1]	validation_0-map:0.84573
[2]	validation_0-map:0.84995
[3]	validation_0-map:0.85140
[4]	validation_0-map:0.85294
[5]	validation_0-map:0.85395
[6]	validation_0-map:0.85473
[7]	validation_0-map:0.85691
[8]	validation_0-map:0.85892
[9]	validation_0-map:0.86243
[10]	validation_0-map:0.86264
[11]	validation_0-map:0.86416
[12]	validation_0-map:0.86487
[13]	validation_0-map:0.86558
[14]	validation_0-map:0.86742
[15]	validation_0-map:0.86957
[16]	validation_0-map:0.87003
[17]	validation_0-map:0.87023
[18]	validation_0-map:0.87101
[19]	validation_0-map:0.87276
[20]	validation_0-map:0.87300
[21]	validation_0-map:0.87322
[22]	validation_0-map:0.87387
[23]	validation_0-map:0.87465
[24]	validation_0-map:0.87502
[25]	validation_0-map:0.87520
[26]	validation_0-map:0.87558
[27]	validation_0-map:0.87603
[28]	validation_0-map:0.87579
[29]	validation_0-map:0.87658
[30]	validation_0-map:0.87705
[31]	validation_0-map:0.87722
[32]	validation_0-map:0.87770
[33]	validation_0-ma

### meu xgboost

In [None]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=25, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)

In [None]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

In [None]:
# train/test
xgb.fit(xtrain, ytrain)
ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

- ## pipeline?

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import MaxAbsScaler

In [None]:
#exported_pipeline = make_pipeline(
#    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1, max_depth=8, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)),
#    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.4, min_samples_leaf=9, min_samples_split=18, n_estimators=100)),
#    MultinomialNB(alpha=1.0, fit_prior=False)
#)
exported_pipeline = make_pipeline(
    MaxAbsScaler(),
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="gini", max_features=0.45, min_samples_leaf=17, min_samples_split=10, n_estimators=100)),
    XGBClassifier(learning_rate=0.01, max_depth=2, min_child_weight=7, n_estimators=100, n_jobs=1, subsample=0.45, verbosity=0)
)

In [None]:
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)

In [None]:
exported_pipeline.fit(xtrain, ytrain)
ypred = exported_pipeline.predict(xtest)
cm = confusion_matrix(ytest,ypred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
pipe_f1 = (2*tp)/(2*tp+fp+fn)
print(pipe_f1)

In [None]:
results = exported_pipeline.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,ypred)

In [None]:
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
pipe_f1 = (2*tp)/(2*tp+fp+fn)
print(pipe_f1)

# param optimize

In [None]:
from sklearn.model_selection import RandomizedSearchCV
model = XGBClassifier()
param_vals = {'max_depth': [200, 500, 800, 1100], 'n_estimators': [100, 200, 300, 400],
              'learning_rate': [0.001, 0.01, 0.1, 1, 10]}
random_rf = RandomizedSearchCV(estimator=model, param_distributions=param_vals,
                              n_iter=10, scoring='f1', cv=5,
                              refit=True, n_jobs=-1)

In [None]:
#Training and prediction
random_rf.fit(xtrain, ytrain)
preds = random_rf.best_estimator_.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

In [32]:
X.dtypes

short_name                         float64
town                               float64
product_id                         float64
client_id                          float64
route_id                           float64
sales_channel_id                   float64
week_number                          int64
weight                               int64
pieces                             float64
state_Top10                          int64
state_Top11-21                       int64
state_Top22-32                       int64
client_name_Bimbo Store              int64
client_name_Consignment              int64
client_name_Eatery                   int64
client_name_Fresh Market             int64
client_name_General Market/Mart      int64
client_name_Hospital/Pharmacy        int64
client_name_Individual               int64
client_name_NO IDENTIFICADO          int64
client_name_Oxxo Store               int64
client_name_Post                     int64
client_name_School                   int64
client_name

## ensembles
- https://towardsdatascience.com/ensemble-learning-using-scikit-learn-85c4531ff86a

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
#fix asap
d = d.dropna()

In [None]:
X, y = d.drop("target", axis=1), d["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [None]:
#create new a knn model
knn = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
params_knn = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)
#fit model to training data
knn_gs.fit(X_train, y_train)

In [None]:
#save best model
knn_best = knn_gs.best_estimator_
#check best n_neigbors value
print(knn_gs.best_params_)

In [None]:
#create a new random forest classifier
rf = RandomForestClassifier()
#create a dictionary of all values we want to test for n_estimators
params_rf = {'n_estimators': [50, 100, 200]}
#use gridsearch to test all values for n_estimators
rf_gs = GridSearchCV(rf, params_rf, cv=5)
#fit model to training data
rf_gs.fit(X_train, y_train)

In [None]:
#save best model
rf_best = rf_gs.best_estimator_
#check best n_estimators value
print(rf_gs.best_params_)

In [None]:
#create a new logistic regression model
log_reg = LogisticRegression()
#fit the model to the training data
log_reg.fit(X_train, y_train)

In [None]:
#test the three models with the test data and print their accuracy scores
print('knn: {}'.format(knn_best.score(X_test, y_test)))
print('rf: {}'.format(rf_best.score(X_test, y_test)))
print('log_reg: {}'.format(log_reg.score(X_test, y_test)))

In [None]:
from sklearn.ensemble import VotingClassifier
#create a dictionary of our models
estimators=[('knn', knn_best), ('rf', rf_best), ('log_reg', log_reg)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')

In [None]:
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
ensemble.score(X_test, y_test)

In [None]:
ypred = ensemble.predict(xtest)
cm = confusion_matrix(ytest,ypred)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ensemble_f1 = (2*tp)/(2*tp+fp+fn)
print(ensemble_f1)

# aaaaaaaaaaaaaaaaaaaaaa

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

In [None]:
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=GradientBoostingClassifier(learning_rate=0.1, max_depth=20, max_features=0.3, min_samples_leaf=4, min_samples_split=15, n_estimators=100, subsample=0.9000000000000001)),
    RobustScaler(),
    StackingEstimator(estimator=GaussianNB()),
    BernoulliNB(alpha=0.01, fit_prior=False)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1234)

In [None]:
exported_pipeline.fit(xtrain, ytrain)
results = exported_pipeline.predict(xtest)

In [None]:
cm = confusion_matrix(ytest,results)
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
aa_f1 = (2*tp)/(2*tp+fp+fn)
print(aa_f1)