In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [52]:
import pandas as pd
import pickle

from fastai import *
from fastai.text import *

from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss, zero_one_loss, accuracy_score

In [10]:
path = Path(".")

In [12]:
docs = load_data(path/"data", "data_clas_export.pkl")

In [13]:
docs

TextClasDataBunch;

Train: LabelList (384 items)
x: TextList
▁xxbos ▁xxmaj ▁contratante : ▁xxup ▁banco ▁xxup ▁de ▁xxup ▁brasília ▁xxup ▁ s . a . ▁- ▁xxup ▁brb ▁- ▁xxmaj ▁pro mit ente ▁xxmaj ▁contratada : ▁xxup ▁di gi sec ▁xxup ▁certifica ção ▁xxup ▁digital ▁xxup ▁eireli - me . ▁xxmaj ▁objeto : ▁xxmaj ▁aquisição ▁de ▁chaves ▁cr ip to gráfica s ▁e ▁ certificados ▁digitais ▁para ▁pessoa ▁f is ica ▁e ▁jurídica ▁e ▁para ▁servidor ▁de ▁rede ▁na ▁hi er ar qui a ▁da ▁i cp - brasil ▁com ▁garantia , ▁conforme ▁especificações ▁técnicas ▁mínimas ▁do ▁edital ▁e ▁seus ▁anexos , ▁bem ▁como ▁da ▁proposta ▁da ▁contratada . ▁xxmaj ▁modalidade ▁da ▁contratação : ▁pregão ▁eletrônico ▁xxup ▁brb ▁no ▁2018 ▁/ ▁007 . ▁xxmaj ▁vigência : ▁de ▁02 ▁/ ▁04 ▁/ ▁2018 ▁à ▁02 ▁/ ▁04 ▁/ ▁2019. ▁xxmaj ▁itens ▁registrado s : ▁1 , 2, 3, 4 ▁e ▁ 7. ▁xxmaj ▁valor ▁xxup ▁r $ ▁37.0 22, 06. ▁xxmaj ▁signatários : ▁pelo ▁xxup ▁brb , ▁xxmaj ▁ k á ti a ▁do ▁xxmaj ▁carmo ▁xxmaj ▁peixoto ▁de ▁xxmaj ▁queiroz ▁e ▁pela ▁contratada , ▁xxm

In [15]:
X_train = [[docs.vocab.itos[o] for o in doc.data] for doc in docs.train_ds.x]

In [16]:
len(X_train)

384

In [17]:
X_valid = [[docs.vocab.itos[o] for o in doc.data] for doc in docs.valid_ds.x]

In [19]:
y_train = docs.train_ds.y.items; y_train

array([11, 15, 15, 15, ..., 15,  8, 15, 14])

In [20]:
len(y_train)

384

In [22]:
y_valid = docs.valid_ds.y.items

In [23]:
len(X_train), y_train.shape, len(X_valid), y_valid.shape

(384, (384,), 96, (96,))

In [24]:
X_train_val = np.concatenate([X_train, X_valid]); X_train_val.shape

(480,)

In [26]:
y_train_val = np.concatenate([y_train, y_valid]); y_train_val.shape

(480,)

In [31]:
valid_fold = np.array([-1]*len(X_train) + [0]*len(X_valid));  valid_fold.shape

(480,)

In [53]:
ps = PredefinedSplit(valid_fold)

In [32]:
docs_test = load_data(path/"data", "test_data.pkl")

In [38]:
X_test = [[docs_test.vocab.itos[o] for o in doc.data] for doc in docs_test.train_ds.x]

In [39]:
len(X_test)

237

In [40]:
y_test = docs_test.train_ds.y.items; y_test

array([ 0, 15, 16, 14, ...,  7, 15, 15, 14])

In [41]:
len(X_test), y_test.shape

(237, (237,))

In [42]:
pipe_nb_count = Pipeline((
    ("vectorizer", CountVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", MultinomialNB(alpha=0.001, fit_prior=True))
))

pipe_nb_tfidf = Pipeline((
    ("vectorizer", TfidfVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", MultinomialNB(alpha=0.001, fit_prior=True))
))

pipe_svc_count = Pipeline((
    ("vectorizer", CountVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", LinearSVC(verbose=2, class_weight="balanced"))
))
    
pipe_svc_tfidf = Pipeline((
    ("vectorizer", TfidfVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", LinearSVC(verbose=2, class_weight="balanced"))
))

In [43]:
veczr_params = {
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorizer__max_df": [0.5, 0.8, 1.],
    "vectorizer__min_df": [1, 2, 3],
    "vectorizer__max_features": [800000],    
}

nb_params = {
    "clf__alpha": [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1],
    "clf__fit_prior": [True, False]
}

nb_params.update(veczr_params)

svc_params = {
    "clf__penalty": ["l2"],
    "clf__C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10],
    "clf__class_weight": ["balanced", None]
}

svc_params.update(veczr_params)

In [54]:
svc_count_search = RandomizedSearchCV(pipe_svc_count, param_distributions=svc_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42, cv=ps)

svc_tfidf_search = RandomizedSearchCV(pipe_svc_tfidf, param_distributions=svc_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42, cv=ps)


nb_count_search  = RandomizedSearchCV(pipe_nb_count, param_distributions=nb_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42, cv=ps)

nb_tfidf_search  = RandomizedSearchCV(pipe_nb_tfidf, param_distributions=nb_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42, cv=ps)

In [55]:
svc_count_search.fit(X_train_val, y_train_val)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   29.9s finished


[LibLinear]

RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=(('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=<function noop at 0x7f88e556da...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=2)))),
          fit_params=None, iid=False, n_iter=100, n_jobs=-2,
          param_distributions={'clf__penalty': ['l2'], 'clf__C': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10], 'clf__class_weight': ['balanced', None], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorizer__max_df': [0.5, 0.8, 1.0], 'vectorizer__min_df': [1, 2, 3], 'vectorizer__max_features': [800000]},
          pre_dispatch='2*n_jobs', random_stat

In [56]:
svc_tfidf_search.fit(X_train_val, y_train_val)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:    6.5s


[LibLinear]

[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   29.5s finished


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=(('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2',
        preprocessor=<function...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=2)))),
          fit_params=None, iid=False, n_iter=100, n_jobs=-2,
          param_distributions={'clf__penalty': ['l2'], 'clf__C': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10], 'clf__class_weight': ['balanced', None], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorizer__max_df': [0.5, 0.8, 1.0], 'vectorizer__min_df': [1, 2, 3], 'vectorizer__max_features': [800000]},
          pre_dispatch='2*n_jobs', random_stat

In [57]:
nb_count_search.fit(X_train_val, y_train_val)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   24.0s finished


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=(('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=<function noop at 0x7f88e556da...e556da70>, vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)))),
          fit_params=None, iid=False, n_iter=100, n_jobs=-2,
          param_distributions={'clf__alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1], 'clf__fit_prior': [True, False], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorizer__max_df': [0.5, 0.8, 1.0], 'vectorizer__min_df': [1, 2, 3], 'vectorizer__max_features': [800000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
 

In [58]:
nb_tfidf_search.fit(X_train_val, y_train_val)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:   24.4s finished


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
          error_score='raise-deprecating',
          estimator=Pipeline(memory=None,
     steps=(('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2',
        preprocessor=<function...e,
        vocabulary=None)), ('clf', MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)))),
          fit_params=None, iid=False, n_iter=100, n_jobs=-2,
          param_distributions={'clf__alpha': [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1], 'clf__fit_prior': [True, False], 'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vectorizer__max_df': [0.5, 0.8, 1.0], 'vectorizer__min_df': [1, 2, 3], 'vectorizer__max_features': [800000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
 

In [59]:
def save_model(model, name):
    pickle.dump(model, open(f'./models/{name}.pkl', "wb"))

In [60]:
svc_count_search.best_score_, svc_tfidf_search.best_score_, nb_count_search.best_score_, nb_tfidf_search.best_score_

(0.9066622062261599,
 0.9164970527506396,
 0.8871132157316369,
 0.8962854818424743)

In [61]:
save_model(svc_tfidf_search.best_estimator_, "svc"), save_model(nb_tfidf_search.best_estimator_, "nb")

(None, None)

In [62]:
svc_tfidf_search.best_params_, nb_tfidf_search.best_params_

({'vectorizer__ngram_range': (1, 1),
  'vectorizer__min_df': 1,
  'vectorizer__max_features': 800000,
  'vectorizer__max_df': 0.8,
  'clf__penalty': 'l2',
  'clf__class_weight': 'balanced',
  'clf__C': 1},
 {'vectorizer__ngram_range': (1, 3),
  'vectorizer__min_df': 1,
  'vectorizer__max_features': 800000,
  'vectorizer__max_df': 1.0,
  'clf__fit_prior': False,
  'clf__alpha': 0.0003})

In [64]:
target_names= docs.train_ds.y.classes

In [65]:
preds = svc_tfidf_search.predict(X_test)
print(classification_report(y_test, preds, target_names=target_names, digits=4))
print(accuracy_score(y_test, preds))

                                                                                   precision    recall  f1-score   support

                                                                       CASA CIVIL     0.7647    0.7222    0.7429        18
                                          CONTROLADORIA GERAL DO DISTRITO FEDERAL     0.6667    1.0000    0.8000         2
                                           DEFENSORIA PÚBLICA DO DISTRITO FEDERAL     1.0000    1.0000    1.0000         8
                                                                  PODER EXECUTIVO     0.7500    0.9000    0.8182        10
                                                                PODER LEGISLATIVO     1.0000    1.0000    1.0000         1
       SECRETARIA DE ESTADO DE AGRICULTURA, ABASTECIMENTO E DESENVOLVIMENTO RURAL     0.7500    0.7500    0.7500         4
                                                  SECRETARIA DE ESTADO DE CULTURA     1.0000    0.8462    0.9167        13
               

In [66]:
preds = nb_tfidf_search.predict(X_test)
print(classification_report(y_test, preds, target_names=target_names, digits=4))
print(accuracy_score(y_test, preds))

                                                                                   precision    recall  f1-score   support

                                                                       CASA CIVIL     0.7222    0.7222    0.7222        18
                                          CONTROLADORIA GERAL DO DISTRITO FEDERAL     0.6667    1.0000    0.8000         2
                                           DEFENSORIA PÚBLICA DO DISTRITO FEDERAL     1.0000    1.0000    1.0000         8
                                                                  PODER EXECUTIVO     0.6667    1.0000    0.8000        10
                                                                PODER LEGISLATIVO     0.2500    1.0000    0.4000         1
       SECRETARIA DE ESTADO DE AGRICULTURA, ABASTECIMENTO E DESENVOLVIMENTO RURAL     0.3333    0.2500    0.2857         4
                                                  SECRETARIA DE ESTADO DE CULTURA     1.0000    0.8462    0.9167        13
               