In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

from fastai import *
from fastai.text import *

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV

In [29]:
# Root folder of the cleaned dataset; the train/test CSV names used when building
# the data bunch are given relative to this directory.
path = "./data/clean/"

In [52]:
# The 20% validation split is carved out of train.csv at random. fastai v1 appears
# to shuffle the rows with NumPy's global RNG (TODO confirm for the installed
# version), so seed it first: otherwise the split, the vocabulary built from it and
# the row order seen by the cross-validation folds change on every run.
np.random.seed(42)
docs = TextClasDataBunch.from_csv(Path(path), "train/train.csv", valid_pct=0.2,
                                  test="test/test.csv")

In [53]:
docs

TextClasDataBunch;

Train: LabelList (383 items)
x: TextList
xxbos xxup processo : 00060 - xxunk / 2018 - 11 . xxmaj partes : xxup distrito xxup federal , por intermédio de sua 
  xxup secretaria xxup de xxup estado xxup de xxup saúde xxup do xxup distrito xxup federal e a empresa xxup produtos 
  xxup roche xxup xxunk e xxup xxunk s xxup a. xxup cnpj xxmaj nº xxunk / 0002 - 04 . xxup objeto : 
  xxup aquisição xxup de xxup xxunk xxup xxunk xxup xxunk , conforme xxmaj ata de xxmaj registro de xxmaj preço nº 
  228 / xxup 2018-b xxup ses / xxup df e xxmaj pedido de xxmaj aquisição de xxmaj material nº 5 - 18 / xxup xxunk e xxmaj autorização de 
  xxmaj fornecimento de xxmaj material nº 5 - 18 / xxup xxunk . xxup valor : xxup r$ xxunk ( cento e trinta e sete mil 
  cinquenta e dois reais ) , xxup prazo xxup de xxup entrega : 100 % xxup em 10 xxup dias , xxup após o xxup recebimento xxup da 
  xxup ne xxup pelo xxup fornecedor . xxup os xxup dias xxup são xxup contados xxup corridos . xxm

In [65]:
docs.train_ds[0][1].data

14

In [73]:
# Turn each training document's numericalised ids back into string tokens via the vocab.
X_train = [
    [docs.vocab.itos[token_id] for token_id in document.data]
    for document in docs.train_ds.x
]

In [78]:
len(X_train)

383

In [79]:
# Same id -> token mapping as for the training documents, applied to the validation split.
X_valid = [
    [docs.vocab.itos[token_id] for token_id in document.data]
    for document in docs.valid_ds.x
]

In [82]:
# The sklearn searches below cross-validate on their own, so the fastai validation
# split is folded back into the training data. The training tokens are rebuilt from
# `docs` rather than read from the current `X_train`, so re-running this cell cannot
# append the validation documents a second time.
X_train = [
    [docs.vocab.itos[token_id] for token_id in document.data]
    for document in docs.train_ds.x
] + X_valid

In [83]:
# Same id -> token mapping, applied to the held-out test documents.
X_test = [
    [docs.vocab.itos[token_id] for token_id in document.data]
    for document in docs.test_ds.x
]

In [91]:
# Integer label ids of the training documents, one entry per document.
y_train = docs.train_ds.y.items
y_train

array([14, 15,  6, 16, ...,  5, 15, 15, 15])

In [92]:
len(y_train)

383

In [100]:
y_valid = docs.valid_ds.y.items
# Concatenate from the untouched fastai label array instead of the current `y_train`,
# so re-running this cell cannot append the validation labels a second time.
# np.concatenate returns a new array, leaving the data bunch's own labels intact.
y_train = np.concatenate([docs.train_ds.y.items, y_valid])

In [102]:
# NOTE(review): fastai v1 normally attaches the test set without labels, in which
# case these items would be placeholder values rather than the labels stored in
# test.csv. Verify (e.g. inspect np.unique(y_test)) before scoring against y_test.
y_test = docs.test_ds.y.items

In [103]:
# Fail loudly if features and labels ever drift out of step (for example after an
# out-of-order re-run of the merge cells) instead of relying on eyeballing the sizes.
assert len(X_train) == len(y_train), "train features and labels are misaligned"
assert len(X_test) == len(y_test), "test features and labels are misaligned"
len(X_train), y_train.shape, len(X_test), y_test.shape

(480, (480,), 237, (237,))

In [113]:
def _make_text_pipeline(vectorizer_cls, clf):
    """Build a ``vectorizer -> clf`` pipeline for documents that are already token lists.

    Parameters
    ----------
    vectorizer_cls : type
        ``CountVectorizer`` or ``TfidfVectorizer``; instantiated here with a 1-3 gram range.
    clf : estimator
        The final classifier step.

    Returns
    -------
    Pipeline
        Steps are named ``"vectorizer"`` and ``"clf"``, which is what the
        ``vectorizer__*`` / ``clf__*`` keys of the search spaces refer to.
    """
    # The documents fed to these pipelines are lists of tokens, not raw strings, so
    # both the preprocessor and the tokenizer are replaced by `noop` (from the fastai
    # star import; presumably the identity function) to leave the tokens untouched.
    vectorizer = vectorizer_cls(ngram_range=(1, 3), preprocessor=noop, tokenizer=noop)
    # scikit-learn documents `steps` as a *list* of (name, estimator) tuples.
    return Pipeline([
        ("vectorizer", vectorizer),
        ("clf", clf),
    ])


pipe_nb_count = _make_text_pipeline(CountVectorizer, MultinomialNB(alpha=0.001, fit_prior=True))

pipe_nb_tfidf = _make_text_pipeline(TfidfVectorizer, MultinomialNB(alpha=0.001, fit_prior=True))

pipe_svc_count = _make_text_pipeline(CountVectorizer, LinearSVC(verbose=2, class_weight="balanced"))

pipe_svc_tfidf = _make_text_pipeline(TfidfVectorizer, LinearSVC(verbose=2, class_weight="balanced"))

In [120]:
# Vectorizer settings explored by every search, whichever classifier follows.
veczr_params = dict(
    vectorizer__ngram_range=[(1, 1), (1, 2), (1, 3)],
    vectorizer__max_df=[0.5, 0.8, 1.],
    vectorizer__min_df=[1, 2, 3],
    vectorizer__max_features=[800000],
)

# Naive Bayes search space: classifier settings first, then the shared vectorizer ones.
nb_params = {
    "clf__alpha": [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1],
    "clf__fit_prior": [True, False],
    **veczr_params,
}

# Linear SVC search space: classifier settings first, then the shared vectorizer ones.
svc_params = {
    "clf__penalty": ["l2"],
    "clf__C": [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10],
    "clf__class_weight": ["balanced", None],
    **veczr_params,
}

In [125]:
# Settings common to all four randomized searches: 100 sampled candidates scored by
# weighted F1, run on all but one CPU core, with a fixed seed for the sampling.
# refit=False means no final model is refitted on the full data after the search.
search_options = dict(
    scoring="f1_weighted",
    n_iter=100,
    n_jobs=-2,
    iid=False,
    refit=False,
    verbose=2,
    random_state=42,
)

svc_count_search = RandomizedSearchCV(pipe_svc_count, param_distributions=svc_params, **search_options)
svc_tfidf_search = RandomizedSearchCV(pipe_svc_tfidf, param_distributions=svc_params, **search_options)

nb_count_search = RandomizedSearchCV(pipe_nb_count, param_distributions=nb_params, **search_options)
nb_tfidf_search = RandomizedSearchCV(pipe_nb_tfidf, param_distributions=nb_params, **search_options)

In [None]:
%%timeit

svc_count_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-2)]: Done 148 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-2)]: Done 300 out of 300 | elapsed:  2.7min finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.


Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed:   14.8s


In [None]:
%%timeit

svc_tfidf_search.fit(X_train, y_train)

In [None]:
%%timeit

nb_count_search.fit(X_train, y_train)

In [None]:
%%timeit

nb_tfidf_search.fit(X_train, y_train)

In [None]:
def save_model(model, name):
    """Pickle ``model`` to ``./models/{name}.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (e.g. a fitted estimator or search object).
    name : str
        File stem; the object is written to ``./models/<name>.pkl`` relative to
        the current working directory.

    Notes
    -----
    The target directory is created when missing, and the file is opened in a
    ``with`` block so the handle is flushed and closed even if pickling fails.
    """
    target = Path(f'./models/{name}.pkl')
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "wb") as fh:
        pickle.dump(model, fh)