In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [172]:
import pandas as pd
import pickle

from fastai import *
from fastai.text import *

from pathlib import Path

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss, zero_one_loss, accuracy_score

In [164]:
path = "./data/clean/"

In [186]:
docs = (TextList.from_csv(Path(path), 'train.csv', cols='text')
                         .split_from_df(col=2)
                         .label_from_df(cols=0))

In [187]:
docs

LabelLists;

Train: LabelList (480 items)
x: TextList
xxbos o xxup diretor - xxup geral xxup do xxup departamento xxup de xxup trânsito xxup do xxup distrito xxup federal , no uso da 
  atribuição prevista no art . 3º , do xxmaj decreto nº 39.002 , de 24 de abril de 2018 , que regulamenta os arts . 44 
  e 45 da xxmaj lei xxmaj complementar nº 840 , de 23 de dezembro de 2011 , e conforme xxmaj processo xxup sei nº 00055- 
  xxunk / 2018 - 94 , resolve : xxup designar xxup juliana xxup maria xxup xxunk , xxmaj agente de xxmaj trânsito , matrícula 
  xxunk - 8 , para substituir xxup xxunk xxup douglas xxup correa , xxmaj agente de xxmaj trânsito , matrícula xxunk - 4 , xxunk , 
  símbolo xxup cne-06 , da xxmaj corregedoria , do xxmaj detran / xxup df , nos períodos de 07 a 26 / 01 / 2019 e 01 a 10 / 07 / 2019 , por 
  motivo de férias do titular . 
  xxup fabrício xxup lima xxup de xxup andrade xxup moura,xxbos xxup processo : 00150 - xxunk / 2018 - 72 ; xxup nota xxup de xxup empenho xx

In [190]:
docs.valid[0][1]

Category CASA CIVIL

In [192]:
X_train = [[docs.vocab.itos[o] for o in doc.data] for doc in docs.train.x]

In [193]:
len(X_train)

480

In [194]:
X_test = [[docs.vocab.itos[o] for o in doc.data] for doc in docs.valid.x]

In [195]:
y_train = docs.train.y.items; y_train

array([15,  6, 15, 11, ..., 14, 15,  5, 15])

In [196]:
len(y_train)

480

In [197]:
y_test = docs.valid.y.items

In [198]:
len(X_train), y_train.shape, len(X_test), y_test.shape

(480, (480,), 237, (237,))

In [199]:
pipe_nb_count = Pipeline((
    ("vectorizer", CountVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", MultinomialNB(alpha=0.001, fit_prior=True))
))

pipe_nb_tfidf = Pipeline((
    ("vectorizer", TfidfVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", MultinomialNB(alpha=0.001, fit_prior=True))
))

pipe_svc_count = Pipeline((
    ("vectorizer", CountVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", LinearSVC(verbose=2, class_weight="balanced"))
))
    
pipe_svc_tfidf = Pipeline((
    ("vectorizer", TfidfVectorizer(ngram_range=(1,3), preprocessor=noop, tokenizer=noop)),
    ("clf", LinearSVC(verbose=2, class_weight="balanced"))
))

In [200]:
veczr_params = {
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "vectorizer__max_df": [0.5, 0.8, 1.],
    "vectorizer__min_df": [1, 2, 3],
    "vectorizer__max_features": [800000],    
}

nb_params = {
    "clf__alpha": [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1],
    "clf__fit_prior": [True, False]
}

nb_params.update(veczr_params)

svc_params = {
    "clf__penalty": ["l2"],
    "clf__C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10],
    "clf__class_weight": ["balanced", None]
}

svc_params.update(veczr_params)

In [201]:
svc_count_search = RandomizedSearchCV(pipe_svc_count, param_distributions=svc_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42)

svc_tfidf_search = RandomizedSearchCV(pipe_svc_tfidf, param_distributions=svc_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42)


nb_count_search  = RandomizedSearchCV(pipe_nb_count, param_distributions=nb_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42)

nb_tfidf_search  = RandomizedSearchCV(pipe_nb_tfidf, param_distributions=nb_params, scoring="f1_weighted",
                                      n_iter=100, n_jobs=-2, iid=False, refit=True,
                                      verbose=2, random_state=42)

In [202]:
svc_count_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-2)]: Done 148 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:  2.8min finished


[LibLinear]

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=(('vectorizer',
                                              CountVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.int64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                               

In [203]:
svc_tfidf_search.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   15.9s
[Parallel(n_jobs=-2)]: Done 148 tasks      | elapsed:  1.4min


[LibLinear]

[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:  2.8min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=(('vectorizer',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                             

In [204]:
nb_count_search.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-2)]: Done 148 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:  2.7min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=(('vectorizer',
                                              CountVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.int64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                               

In [205]:
nb_tfidf_search.fit(X_train, y_train)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-2)]: Done 148 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:  2.6min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=(('vectorizer',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                             

In [206]:
def save_model(model, name):
    pickle.dump(model, open(f'./models/{name}.pkl', "wb"))

In [207]:
svc_count_search.best_score_, svc_tfidf_search.best_score_, nb_count_search.best_score_, nb_tfidf_search.best_score_

(0.843743490584035, 0.8766380503214294, 0.8468940686257834, 0.8725278725434356)

In [208]:
save_model(svc_tfidf_search.best_estimator_, "svc"), save_model(nb_tfidf_search.best_estimator_, "nb")

(None, None)

In [209]:
svc_tfidf_search.best_params_, nb_tfidf_search.best_params_

({'vectorizer__ngram_range': (1, 1),
  'vectorizer__min_df': 1,
  'vectorizer__max_features': 800000,
  'vectorizer__max_df': 0.5,
  'clf__penalty': 'l2',
  'clf__class_weight': 'balanced',
  'clf__C': 1},
 {'vectorizer__ngram_range': (1, 3),
  'vectorizer__min_df': 1,
  'vectorizer__max_features': 800000,
  'vectorizer__max_df': 0.5,
  'clf__fit_prior': False,
  'clf__alpha': 0.001})

In [211]:
target_names= docs.train.y.classes

In [213]:
preds = svc_tfidf_search.predict(X_test)
print(classification_report(y_test, preds, target_names=target_names, digits=4))
print(accuracy_score(y_test, preds))

                                                                                   precision    recall  f1-score   support

                                                                       CASA CIVIL     0.8125    0.7222    0.7647        18
                                          CONTROLADORIA GERAL DO DISTRITO FEDERAL     0.5000    1.0000    0.6667         2
                                           DEFENSORIA PÚBLICA DO DISTRITO FEDERAL     1.0000    1.0000    1.0000         8
                                                                  PODER EXECUTIVO     0.6429    0.9000    0.7500        10
                                                                PODER LEGISLATIVO     1.0000    1.0000    1.0000         1
       SECRETARIA DE ESTADO DE AGRICULTURA, ABASTECIMENTO E DESENVOLVIMENTO RURAL     1.0000    0.7500    0.8571         4
                                                  SECRETARIA DE ESTADO DE CULTURA     1.0000    0.8462    0.9167        13
               

In [214]:
preds = nb_tfidf_search.predict(X_test)
print(classification_report(y_test, preds, target_names=target_names, digits=4))
print(accuracy_score(y_test, preds))

                                                                                   precision    recall  f1-score   support

                                                                       CASA CIVIL     0.7222    0.7222    0.7222        18
                                          CONTROLADORIA GERAL DO DISTRITO FEDERAL     0.5000    1.0000    0.6667         2
                                           DEFENSORIA PÚBLICA DO DISTRITO FEDERAL     1.0000    1.0000    1.0000         8
                                                                  PODER EXECUTIVO     0.6667    1.0000    0.8000        10
                                                                PODER LEGISLATIVO     1.0000    1.0000    1.0000         1
       SECRETARIA DE ESTADO DE AGRICULTURA, ABASTECIMENTO E DESENVOLVIMENTO RURAL     0.4000    0.5000    0.4444         4
                                                  SECRETARIA DE ESTADO DE CULTURA     1.0000    0.8462    0.9167        13
               