In [1]:
import pandas as pd

colunas = ['ROTULO_MANUAL', 'EMENTA_NORM']

df = pd.read_csv('../data/ementas_pre-processadas.csv', header=0, sep=',', quotechar='"', usecols=colunas)

In [2]:
rotulo = 'BAN'

In [3]:
df.loc[df.ROTULO_MANUAL != rotulo, 'ROTULO_MANUAL'] = 'NONE'

In [4]:
df.head()

Unnamed: 0,ROTULO_MANUAL,EMENTA_NORM
0,NONE,apelacao civel acao indenizacao danos morais c...
1,NONE,apelacao civel acao rescisao contratual cumula...
2,NONE,embargos declaracao apelacao civel inexistenci...
3,NONE,embargos declaracao apelacao civel acordao ape...
4,NONE,embargos declaracao apelacao civel previdencia...


In [5]:
groupby_rotulo = df.groupby('ROTULO_MANUAL')
groupby_rotulo.count()

Unnamed: 0_level_0,EMENTA_NORM
ROTULO_MANUAL,Unnamed: 1_level_1
BAN,807
NONE,7309


In [6]:
x = df['EMENTA_NORM'].values.astype('U')
y = df['ROTULO_MANUAL'].values

In [7]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dense', FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate=True)),
    ('clf', LogisticRegression(solver='lbfgs'))
    #('clf', GaussianNB())
])

# Naive Bayes
text_clf.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...enalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False))])

In [9]:
# Predict Class
y_pred = text_clf.predict(x_test)

# Accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [10]:
accuracy

0.9354679802955665

In [27]:
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(y_test, y_pred)

cnf_matrix

array([[  92,  116],
       [  15, 1807]])

In [12]:
from sklearn import metrics

print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         BAN       0.86      0.44      0.58       208
        NONE       0.94      0.99      0.97      1822

   micro avg       0.94      0.94      0.94      2030
   macro avg       0.90      0.72      0.77      2030
weighted avg       0.93      0.94      0.93      2030



In [19]:
from sklearn.model_selection import GridSearchCV

parameters = {
    #'vect__ngram_range': [(1, 1), (1, 2)],
    #'vect__max_df': [1.0, 0.90, 0.8],
    #'vect__min_df': [1, 0.1, 0.05, 0.10],
    'tfidf__use_idf': [True, False]
}

In [20]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=1, cv=3)
gs_clf = gs_clf.fit(x_train, y_train)

In [21]:
gs_clf.best_score_

0.9305076392311483

In [22]:
gs_clf.best_params_

{'tfidf__use_idf': True}