In [1]:
import warnings

# Blanket filter: every warning category is silenced for the rest of the
# session, including deprecation and convergence warnings from scikit-learn.
warnings.filterwarnings(action="ignore")

In [2]:
import inspect

import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [3]:
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) of binary predictions.

    Specificity is ``TN / (TN + FP)``, read from the confusion matrix of
    ``y_true`` against ``y_pred``. The matrix is unpacked as
    ``tn, fp, fn, tp``, so the first label in the matrix's row/column order
    is treated as the negative class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        ``TN / (TN + FP)``, or ``0.0`` when there are no actual negatives
        (``TN + FP == 0``) instead of the NaN a 0/0 division would give.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the labels seen in
        ``y_true`` and ``y_pred`` together are not exactly two classes.
    """
    matrix = confusion_matrix(y_true, y_pred)
    if matrix.shape != (2, 2):
        # Same exception type the tuple-unpacking used to raise, but with a
        # message that says what is actually wrong.
        raise ValueError(
            "specificity_score expects exactly two classes; "
            f"got a confusion matrix of shape {matrix.shape}"
        )

    tn, fp, _fn, _tp = matrix.ravel()
    negatives = tn + fp
    if negatives == 0:
        # No actual negatives: specificity is undefined, report 0.0 rather
        # than letting NaN leak into the cross-validation averages.
        return 0.0
    return tn / negatives

In [4]:
# Keyword that makes a scorer feed `predict_proba` output to the metric.
# scikit-learn 1.4 introduced `response_method` and deprecated `needs_proba`
# (removed in 1.6), so use whichever keyword the installed `make_scorer`
# actually declares.
if "response_method" in inspect.signature(make_scorer).parameters:
    _PROBA_SCORER_KWARGS = {"response_method": "predict_proba"}
else:
    _PROBA_SCORER_KWARGS = {"needs_proba": True}

# Scorers passed to `cross_validate(..., scoring=METRICS)`; each key becomes
# a `test_<key>` column in the result.
METRICS = {
    "accuracy": make_scorer(accuracy_score),
    "precision": make_scorer(precision_score),
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score),
    # AUC is computed from predicted probabilities, not hard labels.
    "AUC": make_scorer(roc_auc_score, **_PROBA_SCORER_KWARGS),
    "specificity": make_scorer(specificity_score),
    "kappa": make_scorer(cohen_kappa_score),
}

In [11]:
# Read the full dataset from disk; down-sampling happens in a later cell.
d = pd.read_csv(filepath_or_buffer="data/03-sem-town-shortname.csv")

In [13]:
# Work on a 10% random subset. The fixed seed makes the subset (and thus
# every score computed from it) reproducible across kernel restarts.
# NOTE: this overwrites `d`, so re-running the cell samples the already
# sampled frame again; re-run the load cell first to start from the full data.
d = d.sample(frac=0.1, random_state=1234)
#d.groupby('target').size()/d['target'].count()
d

Unnamed: 0,sales_channel_id,route_id,product_id,weight,pieces,target,state_AGUASCALIENTES,state_BAJA CALIFORNIA NORTE,state_BAJA CALIFORNIA SUR,state_CAMPECHE,...,brand_PUL,brand_RIC,brand_SAN,brand_SL,brand_SUA,brand_SUN,brand_THO,brand_TR,brand_TRI,brand_WON
34604,0.0,0.205613,0.861310,0,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52567,0.0,0.205105,0.862931,0,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
335785,0.0,0.107993,0.021380,2,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
489601,0.0,0.122839,0.024203,1,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431572,0.0,0.107891,0.022201,2,2,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65290,0.0,0.118263,0.864393,2,2,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
692066,0.0,0.286557,0.861310,0,1,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230496,0.0,0.459630,0.838728,1,0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19242,0.0,0.109721,0.022841,2,0,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Minimal preprocessing

In [None]:
# Alternative column selections that were tried (kept for reference):
#d = d[['week_number','sales_depot_id','sales_channel_id','route_id','client_id','product_id','weight','target']]
#d = d[['sales_channel_id','route_id','product_id','pieces','target']]
#d = d[d.columns.drop(list(d.filter(regex='brand')))]

# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because 'weight' has already been removed from `d`.
d = d.drop(columns=["weight"], errors="ignore")
d.head()

In [14]:
# Label vector is the "target" column; the feature matrix is everything else.
#X, y = d.drop("Demanda_uni_equil", axis=1), d["Demanda_uni_equil"]
y = d["target"]
X = d.drop(columns="target")

In [15]:
# Shared 10-fold stratified splitter, shuffled with a fixed seed, used by the
# model-evaluation cells below.
splitter = StratifiedKFold(10, shuffle=True, random_state=1234)

In [16]:
# Decision tree baseline, evaluated on the shared `splitter` from the previous
# cell (it used to be rebuilt here with identical arguments, which risked the
# two definitions drifting apart).
dt = DecisionTreeClassifier(max_depth=25, random_state=1234)
scores = cross_validate(dt, X, y, cv=splitter, scoring=METRICS)
dt_scores = pd.DataFrame(scores)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(dt_scores.mean()).T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,0.558047,0.034057,0.678902,0.669693,0.610107,0.638468,0.692916,0.738645,0.350801


In [None]:
# Multi-layer perceptron with two hidden layers of 50 units each.
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=200,
    random_state=1234,
)
scores_nn = cross_validate(estimator=nn, X=X, y=y, scoring=METRICS, cv=splitter)
nn_scores = pd.DataFrame(scores_nn)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(nn_scores.mean()).T

In [None]:
# Gaussian naive Bayes with default settings.
nb = GaussianNB()
scores_nb = cross_validate(estimator=nb, X=X, y=y, scoring=METRICS, cv=splitter)
nb_scores = pd.DataFrame(scores_nb)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(nb_scores.mean()).T

In [None]:
# probability=True enables predict_proba, which the AUC scorer in METRICS needs.
svm = SVC(probability=True, random_state=1234)
scores_svm = cross_validate(estimator=svm, X=X, y=y, scoring=METRICS, cv=splitter)
svm_scores = pd.DataFrame(scores_svm)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(svm_scores.mean()).T

## Ensemble classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [None]:
# Random forest of 20 unpruned trees.
clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
#scores = cross_val_score(clf, X, y, cv=5)
scores_clf = cross_validate(estimator=clf, X=X, y=y, scoring=METRICS, cv=splitter)
clf_scores = pd.DataFrame(scores_clf)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(clf_scores.mean()).T

In [None]:
# The estimator definition was commented out, which left `ada` undefined and
# made this cell fail with NameError on a fresh kernel. A fixed seed is set so
# the scores are reproducible, matching the other ensemble cells.
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
scores_ada = cross_validate(ada, X, y, cv=splitter, scoring=METRICS)
ada_scores = pd.DataFrame(scores_ada)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(ada_scores.mean()).T

In [None]:
# Pass the estimator unfitted: cross_validate clones and refits it on every
# training fold, so a prior .fit(X, y) on the full data only added a long,
# discarded training run (250 estimators of depth 20).
gbc = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=1.0,
    max_depth=20,
    random_state=0,
)
scores_gbc = cross_validate(gbc, X, y, cv=splitter, scoring=METRICS)
gbc_scores = pd.DataFrame(scores_gbc)
# One-row summary: mean of every timing/metric column across the folds.
pd.DataFrame(gbc_scores.mean()).T