In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [3]:
def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn+fp)

In [4]:
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        "AUC": make_scorer(roc_auc_score, needs_proba=True),
        "specificity": make_scorer(specificity_score),
        "kappa":make_scorer(cohen_kappa_score)
}

### upload and divide data

In [22]:
d = pd.read_csv("data/03-sem-town-shortname.csv")
d

Unnamed: 0,sales_channel_id,route_id,product_id,weight,pieces,target,state_AGUASCALIENTES,state_BAJA CALIFORNIA NORTE,state_BAJA CALIFORNIA SUR,state_CAMPECHE,...,brand_PUL,brand_RIC,brand_SAN,brand_SL,brand_SUA,brand_SUN,brand_THO,brand_TR,brand_TRI,brand_WON
0,0.0,0.203783,0.861310,0,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.114399,0.025384,0,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.124059,0.024763,0,1,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.107077,0.113427,2,2,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.107484,0.022121,2,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936659,0.3,0.498780,0.680900,0,1,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
936660,0.0,0.122127,0.021700,2,2,0.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
936661,0.0,0.123144,0.024003,1,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
936662,0.0,0.101688,0.961624,2,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
X, y = d.drop("target", axis=1), d["target"]

### prepare for cross validation and/or train/test split

In [None]:
# cross-validation splitting strategy
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)

In [None]:
# train/test split
# https://towardsdatascience.com/understanding-the-confusion-matrix-and-how-to-implement-it-in-python-319202e0fe4d
test_size = 0.33
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = test_size)

# Modelos

### demo_python

- ## decision tree

In [None]:
dt = DecisionTreeClassifier(max_depth=25, random_state=1234)

cross validate

In [27]:
scores = cross_validate(dt, X, y, cv=splitter, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
pd.DataFrame(dt_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,7.871566,0.305403,0.721212,0.725129,0.646077,0.68332,0.773734,0.78666,0.435799


train/test split

In [37]:
dt.fit(xtrain, ytrain)
ypred = dt.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
dt_f1 = (2*tp)/(2*tp+fp+fn)
print(dt_f1)

0.7499838370470094


- ## neural network

In [None]:
nn = MLPClassifier(hidden_layer_sizes=(50,50), max_iter=20, random_state=1234)

cross validate

In [38]:
scores_nn = cross_validate(nn, X, y, cv=splitter, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
pd.DataFrame(nn_scores.mean()).T

train/test split

In [39]:
nn.fit(xtrain, ytrain)
ypred = nn.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
nn_f1 = (2*tp)/(2*tp+fp+fn)
print(nn_f1)

0.7197332479324831


- ## gaussian naive bayes

In [None]:
nb = GaussianNB()
scores_nb = cross_validate(nb, X, y, cv=splitter, scoring=METRICS)
nb_scores = pd.DataFrame(scores_nb)
pd.DataFrame(nb_scores.mean()).T

- ## support vector classification

In [None]:
svm = SVC(random_state=1234, probability=True)
scores_svm = cross_validate(svm, X, y, cv=splitter, scoring=METRICS)
svm_scores = pd.DataFrame(scores_svm)
pd.DataFrame(svm_scores.mean()).T

### ensembles

In [28]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

- ## random forest

In [None]:
clf = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)

cross validate

In [30]:
scores_clf = cross_validate(clf, X, y, cv=splitter, scoring=METRICS)
clf_scores = pd.DataFrame(scores_clf)
pd.DataFrame(clf_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,29.844132,1.710647,0.707313,0.692153,0.668753,0.68025,0.769771,0.740903,0.41055


train/test split

In [36]:
clf.fit(xtrain, ytrain)
ypred = clf.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
clf_f1 = (2*tp)/(2*tp+fp+fn)
print(clf_f1)

0.7272877449399401


- ## adaboost

In [None]:
ada = AdaBoostClassifier(n_estimators=100)

cross validate

In [31]:
scores_ada = cross_validate(ada, X, y, cv=splitter, scoring=METRICS)
ada_scores = pd.DataFrame(scores_ada)
pd.DataFrame(ada_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,78.931976,3.694391,0.684444,0.681598,0.604668,0.640825,0.743154,0.753935,0.361231


train/test split

In [35]:
ada.fit(xtrain, ytrain)
ypred = ada.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
ada_f1 = (2*tp)/(2*tp+fp+fn)
print(ada_f1)

0.7219601776992067


- ## gradient boost

In [None]:
gbc = GradientBoostingClassifier(n_estimators=250, learning_rate=1.0, max_depth=20, random_state=0).fit(X, y)
scores_gbc = cross_validate(gbc, X, y, cv=splitter, scoring=METRICS)
gbc_scores = pd.DataFrame(scores_gbc)
pd.DataFrame(gbc_scores.mean()).T

### outros

- ## xgboost
\
parameters taken from tpot automl algo (check automl nb)

In [7]:
from xgboost import XGBClassifier

In [24]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=25, min_child_weight=16, n_estimators=100, n_jobs=1, subsample=0.8500000000000001, verbosity=0)

cross-validate

In [None]:
scores_xgb = cross_validate(xgb, X, y, cv=splitter, scoring=METRICS)
xgb_scores = pd.DataFrame(scores_xgb)
pd.DataFrame(xgb_scores.mean()).T

train/test split

In [26]:
# train/test
xgb.fit(xtrain, ytrain)
ypred = xgb.predict(xtest)
cm = confusion_matrix(ytest,ypred) 
tp,fp,fn,tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]
f1 = (2*tp)/(2*tp+fp+fn)
print(f1)

0.7619161639645624
